Source: BiblioParsingUtils.py
__all__ = ['biblio_parser',
           'build_institutions_dic',
           'build_title_keywords',
           'check_and_drop_columns',
           'country_normalization',
           'extend_author_institutions',
           'getting_secondary_inst_list',
           'merge_database',
           'name_normalizer',
           'normalize_journal_names',
           'setting_secondary_inst_filter',
           'upgrade_col_names',
           ]

# Globals used from BiblioAnalysis_Utils.BiblioGeneralGlobals: ALIAS_UK, CHANGE, COUNTRIES
# Globals used from BiblioAnalysis_Utils.BiblioSpecificGlobals: BLACKLISTED_WORDS, COL_NAMES,
#                   DIC_INST_FILENAME, DIC_LOW_WORDS, DIC_OUTDIR_PARSING,
#                   INST_FILTER_LIST, REP_UTILS,
#                   NLTK_VALID_TAG_LIST, NOUN_MINIMUM_OCCURRENCES,
#                   RE_NUM_CONF, RE_YEAR_JOURNAL,
#                   SCOPUS, USECOLS_SCOPUS, WOS
# Functions used from BiblioAnalysis_Utils.BiblioGui: Select_multi_items
# Functions used from BiblioAnalysis_Utils.BiblioParsingScopus: biblio_parser_scopus
# Functions used from BiblioAnalysis_Utils.BiblioParsingWos: biblio_parser_wos


def build_title_keywords(df):
    '''Given the dataframe 'df' with one column 'Title':

                                        Title
        0   Experimental and CFD investigation of inert be...
        1   Impact of Silicon/Graphite Composite Electrode...

    the function 'build_title_keywords':

    1- Builds the set "keywords_TK" of the tokens appearing at least NOUN_MINIMUM_OCCURRENCES times
       in all the article titles of the corpus. The tokens are the words of the title with nltk tags
       belonging to the global list 'NLTK_VALID_TAG_LIST'.
    2- Adds two columns 'title_token' and 'kept_tokens' to the dataframe 'df'. The column 'title_token'
       contains the tokenized and lemmatized (using the nltk WordNetLemmatizer) title. The column
       'kept_tokens' contains the list of words common to the set "keywords_TK" and to the tokenized title.
    3- Builds the list of tuples 'bag_of_words_occurrences'
       [(token_1, # occurrences of token_1), (token_2, # occurrences of token_2), ...] ordered by
       decreasing number of occurrences.
    4- Removes the words belonging to BLACKLISTED_WORDS from the bag of words.

    Args:
        df (dataframe): pub_id | Title

    Returns:
        df (dataframe): pub_id | title_token | kept_tokens where title_token is the list of tokens of
                        the title and kept_tokens the list of tokens with an occurrence frequency
                        >= NOUN_MINIMUM_OCCURRENCES
        bag_of_words_occurrences (list of tuples): [(word_1, # occurrences of word_1), (word_2, # occurrences of word_2), ...]

    '''
    # Standard library imports
    import operator
    from collections import Counter

    # 3rd party imports
    import nltk
    import numpy as np

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import NLTK_VALID_TAG_LIST
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import NOUN_MINIMUM_OCCURRENCES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import BLACKLISTED_WORDS

    def tokenizer(text):
        '''Tokenizes and lemmatizes the string 'text'. Only the words with nltk tags in the global
        NLTK_VALID_TAG_LIST are kept.

        ex: 'Thermal stability of Mg2Si0.55Sn0.45 for thermoelectric applications'
        gives the list: ['thermal', 'stability', 'mg2si0.55sn0.45', 'thermoelectric', 'application']

        Args:
            text (string): string to tokenize.

        Returns:
            The list valid_words_lemmatized.
        '''
        tokenized = nltk.word_tokenize(text.lower())
        valid_words = [word for (word, pos) in nltk.pos_tag(tokenized)
                       if pos in NLTK_VALID_TAG_LIST]
        stemmer = nltk.stem.WordNetLemmatizer()
        valid_words_lemmatized = [stemmer.lemmatize(valid_word) for valid_word in valid_words]

        return valid_words_lemmatized

    df['title_token'] = df['Title'].apply(tokenizer)

    # Removes the blacklisted words from the bag of words
    bag_of_words = np.array(df.title_token.sum())
    for remove in BLACKLISTED_WORDS:
        bag_of_words = bag_of_words[bag_of_words != remove]

    bag_of_words_occurrences = list(Counter(bag_of_words).items())
    bag_of_words_occurrences.sort(key=operator.itemgetter(1), reverse=True)

    keywords_TK = set([x for x, y in bag_of_words_occurrences if y >= NOUN_MINIMUM_OCCURRENCES])

    df['kept_tokens'] = df['title_token'].apply(lambda x: list(keywords_TK.intersection(set(x))))

    return df, bag_of_words_occurrences


def country_normalization(country):
    '''Normalizes the country name for consistency between the wos and scopus corpuses.
    '''
    # Local imports
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import ALIAS_UK
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import COUNTRIES

    country_clean = country
    if country not in COUNTRIES:
        if country in ALIAS_UK:
            country_clean = 'United Kingdom'
        elif 'USA' in country:
            country_clean = 'United States'
        elif ('china' in country) or ('China' in country):
            country_clean = 'China'
        elif country == 'Russia':
            country_clean = 'Russian Federation'
        elif country == 'U Arab Emirates':
            country_clean = 'United Arab Emirates'
        elif country == 'Vietnam':
            country_clean = 'Viet Nam'
        else:
            country_clean = ''

    return country_clean


def build_institutions_dic(rep_utils=None, dic_inst_filename=None):
    '''The `build_institutions_dic` function builds the dict 'inst_dic'
    giving the normalized names of institutions from a csv file `dic_inst_filename`.
    The name of the csv file is set in the `DIC_INST_FILENAME` global.

    Args:
        rep_utils (str): name of the folder where the csv file is stored.
        dic_inst_filename (str): name of the csv file.

    Returns:
        `dict`: `inst_dic` as {raw_inst: norm_inst} where
                - raw_inst is a raw institution name,
                - norm_inst is the normalized institution name.

    Note:
        The globals `REP_UTILS` and `DIC_INST_FILENAME` are used.

    '''
    # Standard library imports
    from pathlib import Path

    # 3rd party imports
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_INST_FILENAME
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import REP_UTILS

    if dic_inst_filename is None: dic_inst_filename = DIC_INST_FILENAME
    if rep_utils is None: rep_utils = REP_UTILS

    # Setting the file path for the dic_inst_filename file reading
    path_dic_inst = Path(__file__).parent / rep_utils / Path(dic_inst_filename)

    # Reading and cleaning the dic_inst_filename file
    inst_dic = pd.read_csv(path_dic_inst, sep=':', header=None, encoding='latin1')
    inst_dic.sort_values([0], inplace=True)
    inst_dic[0] = inst_dic[0].str.strip()
    inst_dic[1] = inst_dic[1].str.strip()
    inst_dic = dict(zip(inst_dic[0], inst_dic[1]))

    return inst_dic


def setting_secondary_inst_filter(out_dir_parsing):
    '''The `setting_secondary_inst_filter` function allows building the affiliation filter "inst_filter_list"
    from the institutions list of the corpus using the `Select_multi_items` GUI.

    Args:
        out_dir_parsing (path): the corpus parsing path for reading the "DIC_OUTDIR_PARSING['I2']" file.

    Returns:
        (list): list of tuples (institution, country) selected by the user.

    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used.
        The function `Select_multi_items` is used from the `BiblioAnalysis_Utils` package.

    '''
    # Standard library imports
    from pathlib import Path

    # 3rd party imports
    import numpy as np
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioGui import Select_multi_items
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING

    institutions_alias = COL_NAMES['auth_inst'][4]
    country_alias = COL_NAMES['country'][2]

    df_auth_inst = pd.read_csv(Path(out_dir_parsing) / Path(DIC_OUTDIR_PARSING['I2']),
                               sep='\t')
    raw_institutions_list = []
    for auth_inst in df_auth_inst[institutions_alias]:
        raw_institutions_list.append(auth_inst)

    institutions_list = list(np.concatenate([raw_inst.split(';') for raw_inst in raw_institutions_list]))
    institutions_list = sorted(list(set(institutions_list)))

    country_institution_list = [x.split('_')[1] + ':' + x.split('_')[0] for x in institutions_list]
    country_institution_list = sorted(country_institution_list)

    selected_list = Select_multi_items(country_institution_list,
                                       mode='multiple',
                                       fact=2,
                                       win_widthmm=80,
                                       win_heightmm=100,
                                       font_size=16)

    inst_filter_list = [(x.split(':')[1].strip(), x.split(':')[0].strip()) for x in selected_list]

    return inst_filter_list


def merge_database(database, filename, in_dir, out_dir):
    '''Merges several databases into one database.

    Args:
        database (string): database type (scopus or wos).
        filename (str): name of the merged database.
        in_dir (str): name of the folder where the databases are stored.
        out_dir (str): name of the folder where the merged database will be stored.

    Notes:
        The USECOLS_SCOPUS global is used.

    '''
    # Standard library imports
    import os
    from pathlib import Path
    import sys

    # 3rd party imports
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import USECOLS_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    list_data_base = []
    list_df = []
    if database == WOS:
        for path, _, files in os.walk(in_dir):
            list_data_base.extend(Path(path) / Path(file) for file in files
                                  if file.endswith(".txt"))
        for file in list_data_base:
            list_df.append(read_database_wos(file))
    elif database == SCOPUS:
        for path, _, files in os.walk(in_dir):
            list_data_base.extend(Path(path) / Path(file) for file in files
                                  if file.endswith(".csv"))
        for file in list_data_base:
            df = pd.read_csv(file, usecols=USECOLS_SCOPUS)  # Reads the database
            list_df.append(df)
    else:
        raise Exception(f"Sorry, unrecognized database {database}: should be {WOS} or {SCOPUS}")

    result = pd.concat(list_df, ignore_index=True)
    result.to_csv(out_dir / Path(filename), sep='\t')


def name_normalizer(text):
    '''Normalizes the author name spelling according to four debatable rules:
        - replacing non-ascii letters by ascii ones,
        - capitalizing the first name,
        - capitalizing the surnames,
        - suppressing comma and dot.

    ex: name_normalizer(" GrÃÅ-biçà -vèLU D'aillön, E-kj. ")
    >>> "Grol-Bica-Velu D'Aillon E-KJ"

    Args:
        text (str): text to normalize.

    Returns:
        The normalized text.

    Notes:
        The CHANGE global is used.
    '''
    # Standard library imports
    import functools
    import re
    import unicodedata

    # Local imports
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import CHANGE

    nfc = functools.partial(unicodedata.normalize, 'NFD')

    text = text.translate(CHANGE)  # Translates special characters using the global CHANGE dict
    text = nfc(text). \
           encode('ascii', 'ignore'). \
           decode('utf-8'). \
           strip()

    re_minus = re.compile(r'(-[a-zA-Z]+)')  # Captures: "cCc-cC-ccc-CCc"
    for text_minus_texts in re.findall(re_minus, text):
        text = text.replace(text_minus_texts, '-' + text_minus_texts[1:].capitalize())

    re_apostrophe = re.compile(r"('[a-zA-Z]+)")  # Captures: "cCc'cC'ccc'cc'CCc"
    for text_minus_texts in re.findall(re_apostrophe, text):
        text = text.replace(text_minus_texts, "'" + text_minus_texts[1:].capitalize())

    re_minus = re.compile(r'([a-zA-Z]+-)')  # Captures: "cCc-"
    for text_minus_texts in re.findall(re_minus, text):
        text = text.replace(text_minus_texts, text_minus_texts[:-1].capitalize() + '-')

    re_apostrophe = re.compile(r"([a-zA-Z]+')")  # Captures: "cCc'"
    for text_minus_texts in re.findall(re_apostrophe, text):
        text = text.replace(text_minus_texts, text_minus_texts[:-1].capitalize() + "'")

    re_surname = r"[a-zA-Z]+\s"  # Captures: "cCccC "
    for text_minus_texts in re.findall(re_surname, text):
        text = text.replace(text_minus_texts, text_minus_texts.capitalize())

    re_minus_first_name = r'\s[a-zA-Z]+-[a-zA-Z]+$'  # Captures: "cCc-cC" in the first name
    for x in re.findall(re_minus_first_name, text):
        text = text.replace(x, x.upper())

    return text


def normalize_journal_names(database, df_corpus):
    '''The `normalize_journal_names` function normalizes the journal names in the journal-specific column
    of the corpus dataframe by replacing the low words defined in the global 'DIC_LOW_WORDS'
    and by dropping particular items using the regular expressions defined by the 'RE_NUM_CONF'
    and 'RE_YEAR_JOURNAL' globals.

    Args:
        database (string): type of database among the ones defined by the SCOPUS and WOS globals.
        df_corpus (dataframe): corpus dataframe to be normalized in terms of journal names.

    Returns:
        (dataframe): the dataframe with normalized journal names.

    Note:
        The globals 'COLUMN_LABEL_WOS', 'COLUMN_LABEL_SCOPUS', 'DIC_LOW_WORDS', 'RE_NUM_CONF',
        'RE_YEAR_JOURNAL', 'SCOPUS' and 'WOS' are used.

    '''
    # Standard library imports
    import re

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_WOS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_LOW_WORDS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import RE_NUM_CONF
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import RE_YEAR_JOURNAL
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    def _normalize_low_words(text):
        for low_word in DIC_LOW_WORDS.keys():
            text = text.replace(low_word, DIC_LOW_WORDS[low_word]).strip()
        text = " ".join(text.split())
        return text

    def _journal_normalizer(journal):
        journal = ' ' + journal + ' '
        journal = journal.lower()
        journal_list = [" " + x + " " for x in journal.split()]
        new_journal = " ".join(journal_list)
        if RE_YEAR_JOURNAL.findall(journal) or RE_NUM_CONF.findall(journal):
            to_remove = [x for x in journal_list if (RE_YEAR_JOURNAL.findall(x) or RE_NUM_CONF.findall(x))]
            for x in to_remove: new_journal = new_journal.replace(x, '')
        new_journal = " ".join(new_journal.split())
        new_journal = _normalize_low_words(new_journal)
        return new_journal

    if database == WOS:
        journal_alias = COLUMN_LABEL_WOS['journal']
    elif database == SCOPUS:
        journal_alias = COLUMN_LABEL_SCOPUS['journal']
    else:
        raise Exception(f"Sorry, unrecognized database {database}: should be {WOS} or {SCOPUS}")

    df_corpus[journal_alias] = df_corpus[journal_alias].apply(_journal_normalizer)

    return df_corpus


def biblio_parser(in_dir_parsing, out_dir_parsing, database, expert, rep_utils=None, inst_filter_list=None):
    '''Chooses the appropriate parser to parse wos or scopus databases.
    '''
    # Local imports
    from BiblioAnalysis_Utils.BiblioParsingScopus import biblio_parser_scopus
    from BiblioAnalysis_Utils.BiblioParsingWos import biblio_parser_wos
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import INST_FILTER_LIST
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import REP_UTILS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    if database == WOS:
        biblio_parser_wos(in_dir_parsing, out_dir_parsing, inst_filter_list)
    elif database == SCOPUS:
        if rep_utils is None: rep_utils = REP_UTILS
        biblio_parser_scopus(in_dir_parsing, out_dir_parsing, rep_utils, inst_filter_list)
    else:
        raise Exception(f"Sorry, unrecognized database {database}: should be wos or scopus")


def check_and_drop_columns(database, df, filename):
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_WOS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    # Checks for missing mandatory columns
    if database == WOS:
        cols_mandatory = set([val for val in COLUMN_LABEL_WOS.values() if val])
    elif database == SCOPUS:
        cols_mandatory = set([val for val in COLUMN_LABEL_SCOPUS.values() if val])
    else:
        raise Exception(f"Sorry, unrecognized database {database}: should be {WOS} or {SCOPUS}")

    cols_available = set(df.columns)
    missing_columns = cols_mandatory.difference(cols_available)
    if missing_columns:
        raise Exception(f'The mandatory columns: {",".join(missing_columns)} are missing from {filename}\nplease correct before proceeding')

    # Columns selection and dataframe reformatting
    cols_to_drop = list(cols_available.difference(cols_mandatory))
    df.drop(cols_to_drop,
            axis=1,
            inplace=True)  # Drops unused columns
    df.index = range(len(df))  # Sets the pub_id in the df index

    return df


def upgrade_col_names(corpus_folder):
    '''Adds names to the columns of the parsing and filter_<i> files to take into account the
    upgrade of BiblioAnalysis_Utils.

    Args:
        corpus_folder (str): folder of the corpus to be adapted.
    '''
    # Standard library imports
    import os

    # 3rd party imports
    import colorama
    import pandas as pd
    from colorama import Back
    from colorama import Fore
    from colorama import Style
    from pandas.core.groupby.groupby import DataError

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES

    # Beware: the new file authorsinst.dat is not present in the old parsing folders
    dict_filename_conversion = {'addresses.dat': 'address',
                                'articles.dat': 'articles',
                                'authors.dat': 'authors',
                                'authorsinst.dat': 'auth_inst',
                                'authorskeywords.dat': 'keywords',
                                'countries.dat': 'country',
                                'institutions.dat': 'institution',
                                'journalkeywords.dat': 'keywords',
                                'references.dat': 'references',
                                'subjects.dat': 'subject',
                                'subjects2.dat': 'sub_subject',
                                'titlekeywords.dat': 'keywords'}

    for dirpath, dirs, files in os.walk(corpus_folder):
        if ('parsing' in dirpath) | ('filter_' in dirpath):
            for file in [file for file in files
                         if (file.split('.')[1] == 'dat')
                         and (file != 'database.dat')     # Not used: this file is no longer generated
                         and (file != 'keywords.dat')]:   # Not used: this file is no longer generated
                try:
                    df = pd.read_csv(os.path.join(dirpath, file), sep='\t', header=None)

                    if df.loc[0].tolist() == COL_NAMES[dict_filename_conversion[file]]:
                        print(f'The file {os.path.join(dirpath, file)} is up to date')
                    else:
                        df.columns = COL_NAMES[dict_filename_conversion[file]]
                        df.to_csv(os.path.join(dirpath, file), sep='\t', index=False)
                        print(Fore.GREEN + f'*** The file {os.path.join(dirpath, file)} has been upgraded ***' + Style.RESET_ALL)
                except pd.errors.EmptyDataError:
                    df = pd.DataFrame(columns=COL_NAMES[dict_filename_conversion[file]])
                    df.to_csv(os.path.join(dirpath, file), sep='\t', index=False)
                    print(Fore.BLUE + f'*** The EMPTY file {os.path.join(dirpath, file)} has been upgraded ***' + Style.RESET_ALL)
                except:
                    print(Fore.WHITE + Back.RED + f'Warning: File {os.path.join(dirpath, file)} not recognized as a parsing file' + Style.RESET_ALL)


def extend_author_institutions(in_dir, inst_filter_list):
    '''The `extend_author_institutions` function extends the .dat file of authors with institutions,
    initially obtained by the parsing of the corpus, with complementary information about the institutions
    selected by the user.

    Args:
        in_dir (path): path to the .dat file of authors with institutions.
        inst_filter_list (list): the affiliation filter list of tuples (institution, country).

    Returns:
        None.

    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used
        from the `BiblioAnalysis_Utils` package.

    '''
    # Standard library imports
    from pathlib import Path

    # 3rd party imports
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING

    def _address_inst_list(inst_names_list, institutions):
        secondary_institutions = []
        for inst in inst_names_list:
            if inst in institutions:
                secondary_institutions.append(1)
            else:
                secondary_institutions.append(0)
        return secondary_institutions

    institutions_alias = COL_NAMES['auth_inst'][4]
    sec_institutions_alias = COL_NAMES['auth_inst'][5]

    # Setting the key for the name of the '.dat' file of authors with institutions
    # obtained by parsing the corpus
    item = 'I2'

    # Reading the '.dat' file
    read_usecols = [COL_NAMES['auth_inst'][x] for x in [0, 1, 2, 3, 4]]
    df_I2 = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING[item]),
                        sep='\t',
                        usecols=read_usecols)

    # Setting an institution name for each of the institutions indicated in the institutions filter
    inst_names_list = [f'{x[0]}_{x[1]}' for x in inst_filter_list]

    # Building the "sec_institutions_alias" column in the 'df_I2' dataframe using "inst_filter_list"
    df_I2[sec_institutions_alias] = df_I2.apply(lambda row:
                                                _address_inst_list(inst_names_list, row[institutions_alias]),
                                                axis=1)

    # Distributing in a 'df_inst_split' dataframe the value lists of the 'df_I2[sec_institutions_alias]' column
    # into columns whose names are in the 'inst_names_list' list
    df_inst_split = pd.DataFrame(df_I2[sec_institutions_alias].sort_index().to_list(),
                                 columns=inst_names_list)

    # Extending the 'df_I2' dataframe with the 'df_inst_split' dataframe
    df_I2 = pd.concat([df_I2, df_inst_split], axis=1)

    # Dropping the 'df_I2[sec_institutions_alias]' column which is no longer useful
    df_I2.drop([sec_institutions_alias], axis=1, inplace=True)

    # Saving the extended 'df_I2' dataframe in the same '.dat' file
    df_I2.to_csv(in_dir / Path(DIC_OUTDIR_PARSING[item]),
                 index=False,
                 sep='\t')


def getting_secondary_inst_list(out_dir_parsing):
    '''The `getting_secondary_inst_list` function provides the list of institutions of the corpus.

    Args:
        out_dir_parsing (path): the corpus parsing path for reading the "DIC_OUTDIR_PARSING['I2']" file
                                that lists the authors with their institutions for each article.

    Returns:
        (list): list of strings 'country:institution'.

    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used.
    '''
    # Standard library imports
    from pathlib import Path

    # 3rd party imports
    import numpy as np
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING

    institutions_alias = COL_NAMES['auth_inst'][4]
    country_alias = COL_NAMES['country'][2]

    df_auth_inst = pd.read_csv(Path(out_dir_parsing) / Path(DIC_OUTDIR_PARSING['I2']),
                               sep='\t')
    raw_institutions_list = []
    for auth_inst in df_auth_inst[institutions_alias]:
        raw_institutions_list.append(auth_inst)

    institutions_list = list(np.concatenate([raw_inst.split(';') for raw_inst in raw_institutions_list]))
    institutions_list = sorted(list(set(institutions_list)))

    country_institution_list = [x.split('_')[1] + ':' + x.split('_')[0] for x in institutions_list]
    country_institution_list = sorted(country_institution_list)

    return country_institution_list

...
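As a quick orientation, the sketch below shows how a few of these helpers might be tried on small examples and how the full parser might be invoked. It is a minimal usage sketch, not part of the module: the module path BiblioAnalysis_Utils.BiblioParsingUtils is assumed from the file name, the folder names 'my_corpus/rawdata' and 'my_corpus/parsing' are placeholders, and the calls assume the BiblioAnalysis_Utils package and the required nltk data are installed.

# Minimal usage sketch (assumptions: module path, placeholder folders, installed nltk data)
import pandas as pd

from BiblioAnalysis_Utils.BiblioParsingUtils import (biblio_parser,
                                                     build_title_keywords,
                                                     country_normalization,
                                                     name_normalizer)

# Stand-alone helpers can be tried directly on small inputs
print(country_normalization('U Arab Emirates'))   # expected: 'United Arab Emirates'
print(name_normalizer("  o'neill, j-p. "))        # prints a normalized author name

# Title keywords from a toy corpus dataframe with a 'Title' column
df_titles = pd.DataFrame({'Title': ['Impact of Silicon/Graphite Composite Electrodes',
                                    'Experimental and CFD investigation of inert beds']})
df_tokens, bag_of_words = build_title_keywords(df_titles)
print(bag_of_words[:5])                           # most frequent title tokens

# Full parsing of a raw corpus export (paths and the 'scopus' label are placeholders)
# biblio_parser('my_corpus/rawdata', 'my_corpus/parsing',
#               database='scopus', expert=False)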
Source: BiblioFilter.py
__all__ = ['filter_corpus_new',
           'filters_modification',
           'item_filter_modification',
           'item_values_list',
           'read_config_filters',]

# Functions used from BiblioAnalysis_Utils.BiblioGui: Select_multi_items, filter_item_selection
# Globals used from BiblioAnalysis_Utils.BiblioSpecificGlobals: DIC_OUTDIR_PARSING


def filter_corpus_new(in_dir, out_dir, verbose, file_config_filters):
    '''Filters the parsed corpus according to the filtering configuration file 'file_config_filters'.
    '''
    # Reads the filtering parameters
    combine, exclusion, filter_param = read_config_filters(file_config_filters)

    # Builds the set of article ids to keep
    tokeep = _filter_pub_id(combine, exclusion, filter_param, in_dir)

    # Stores the filtered files
    _save_filtered_files(tokeep, in_dir, out_dir)


def read_config_filters(file_config):
    """Parses a json file to build the filtering configuration.

    Args:
        file_config (Path): absolute path of the configuration file.

    Returns:
        combine (str): "intersection" or "union".
        exclusion (bool): if True, the complementary set of the kept pub_id set is returned.
        filter_param (dict): {key: list of keywords}.
    """
    # Standard library imports
    import json
    from collections import defaultdict

    filter_param = defaultdict(list)

    with open(file_config, "r") as read_file:
        config_filter = json.load(read_file)

    combine = config_filter["COMBINE"]
    exclusion = config_filter["EXCLUSION"]

    for key, value in config_filter.items():
        if isinstance(value, dict):
            if value['mode']:
                filter_param[key] = value["list"]

    return combine, exclusion, filter_param


def _filter_pub_id(combine, exclusion, filter_param, in_dir):
    '''Finds the set of the identifiers (pub_id) of the publications
    that satisfy the sorting criteria.

    Args:
        combine (string): "intersection" or "union"; defines the combination
                          of the sets of the kept pub_id by item key.
        exclusion (bool): if True, the complementary set of the kept pub_id set
                          resulting from the combination is returned.
        filter_param (dict): {item key: [list of items to keep]}
                             ex: {"CU": ["France", "Italy"]}.

    Returns:
        tokeep (set): set of kept publication ids.
    '''
    # Standard library imports
    import os
    import re
    from pathlib import Path
    from string import Template

    # 3rd party imports
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES

    filter_on = list(filter_param.keys())  # List of items to be filtered

    t = Template('$colname in @$item')  # Template for the query

    keepid = {}

    # Builds keepid[Y]={Y}, keepid[J]={J}, keepid[DT]={DT}, keepid[LA]={LA}
    # where {Y}, {J}, {DT} and {LA} are the sets of pub_id of articles with
    # Year in filter_param["Y"], Journal in filter_param["J"],
    # doctype in filter_param["DT"] and Language (LA) in filter_param["LA"]
    # ----------------------------------------------------------------------------
    for idx, item in enumerate(set(filter_on) & set(["Y", "J", "DT", "LA"])):
        if idx == 0:  # On the first round we read the data
            df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING['A']),
                             sep='\t',
                             dtype={x: 'str' for x in COL_NAMES['articles'][1:]})

        if item == 'Y':  # Years selection
            year = [str(x) for x in filter_param['Y']]
            query = t.substitute({'colname': df.columns[2],
                                  'item': 'year'})
            keepid[item] = set(df.query(query)[df.columns[0]])

        elif item == 'J':  # Journal selection
            journals = filter_param['J']
            query = t.substitute({'colname': df.columns[3],
                                  'item': 'journals'})
            keepid[item] = set(df.query(query)[df.columns[0]])

        elif item == 'DT':  # Document type selection
            doctypes = filter_param['DT']
            query = t.substitute({'colname': df.columns[7],
                                  'item': 'doctypes'})
            keepid[item] = set(df.query(query)[df.columns[0]])

        elif item == 'LA':  # Language selection
            languages = filter_param['LA']
            query = t.substitute({'colname': df.columns[8],
                                  'item': 'languages'})
            keepid[item] = set(df.query(query)[df.columns[0]])

    # Builds keepid[IK]={IK}, keepid[TK]={TK}, keepid[AK]={AK}
    # where {IK}, {TK}, {AK} are the sets of pub_id of articles with
    # one keyword respectively in filter_param["IK"], filter_param["TK"], filter_param["AK"]
    # ---------------------------------------------------------------
    if "IK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["IK"]), sep='\t')
        keywords = filter_param["IK"]
        query = t.substitute({'colname': df.columns[1],
                              'item': 'keywords'})
        keepid['IK'] = set(df.query(query)[df.columns[0]])

    if "AK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["AK"]), sep='\t')
        keywords = filter_param["AK"]
        query = t.substitute({'colname': df.columns[1],
                              'item': 'keywords'})
        keepid['AK'] = set(df.query(query)[df.columns[0]])

    if "TK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["TK"]), sep='\t')
        keywords = filter_param["TK"]
        query = t.substitute({'colname': df.columns[1],
                              'item': 'keywords'})
        keepid['TK'] = set(df.query(query)[df.columns[0]])

    # Builds keepid[AU]={AU} where {AU} is the set of pub_id
    # of articles with at least one coauthor in the list filter_param["AU"]
    # ------------------------------------------------------------
    if "AU" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["AU"]), sep='\t')
        authors = filter_param["AU"]
        query = t.substitute({'colname': df.columns[2],
                              'item': 'authors'})
        keepid['AU'] = set(df.query(query)[df.columns[0]])

    # Builds keepid[CU]={CU} where {CU} is the set of pub_id
    # of articles with at least one coauthor country in the list filter_param["CU"]
    # ------------------------------------------------------------
    if "CU" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["CU"]), sep='\t')
        countries = filter_param["CU"]
        query = t.substitute({'colname': df.columns[2],
                              'item': 'countries'})
        keepid["CU"] = set(df.query(query)[df.columns[0]])

    # Builds keepid[I]={I} where {I} is the set of pub_id
    # of articles with at least one coauthor institution in the list filter_param["I"]
    # ------------------------------------------------------------
    if "I" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["I"]), sep='\t')
        institutions = filter_param["I"]
        query = t.substitute({'colname': df.columns[2],
                              'item': 'institutions'})
        keepid["I"] = set(df.query(query)[df.columns[0]])

    # Builds keepid[S]={S} where {S} is the set of pub_id
    # of articles with subjects in the list filter_param["S"]
    # ------------------------------------------------------------
    if "S" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["S"]), sep='\t')
        subjects = filter_param["S"]
        query = t.substitute({'colname': df.columns[1],
                              'item': 'subjects'})
        keepid["S"] = set(df.query(query)[df.columns[0]])

    # Builds keepid[S2]={S2} where {S2} is the set of pub_id
    # of articles with subsubjects in the list filter_param["S2"]
    # ------------------------------------------------------------
    if "S2" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["S2"]), sep='\t')
        subsubjects = filter_param["S2"]
        query = t.substitute({'colname': df.columns[1],
                              'item': 'subsubjects'})
        keepid["S2"] = set(df.query(query)[df.columns[0]])

    # Builds keepid[R]={R}, keepid[RJ]={RJ}
    # where {R} is the set of article ids with references in the list filter_param["R"]
    # and {RJ} is the set of article ids with reference journals in the list filter_param["RJ"]
    # ------------------------------------------------------------
    if ("R" in filter_on) or ("RJ" in filter_on):
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["R"]), sep='\t').astype(str)

        if "R" in filter_on:
            find_0 = re.compile(r',\s?0')
            df['ref'] = df.apply(lambda row: re.sub(find_0, '', ', '.join(row[1:-1])),
                                 axis=1)
            references = filter_param["R"]
            query = t.substitute({'colname': 'ref',       # Query on the rebuilt 'ref' column
                                  'item': 'references'})
            keepid["R"] = set(df.query(query)[df.columns[0]])

        if "RJ" in filter_on:
            refsources = filter_param["RJ"]
            query = t.substitute({'colname': df.columns[3],
                                  'item': 'refsources'})
            keepid["RJ"] = set(df.query(query)[df.columns[0]])

    # Combines the filtering conditions: union / intersection / exclusion
    # -------------------------------------------------------------------
    tokeep = [value for value in keepid.values()]  # List of kept id sets
    if combine == "intersection":
        tokeep = set.intersection(*tokeep)
    if combine == "union":
        tokeep = set.union(*tokeep)
    if exclusion:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING['A']), sep='\t')
        set_pub_id = set(df[df.columns[0]])  # Set of all pub_id
        tokeep = set_pub_id.difference(tokeep)

    return tokeep


def _save_filtered_files(tokeep, in_dir, out_dir):
    '''Filters all the files with ".dat" extension located in the folder in_dir
    and saves the filtered files in the folder out_dir.
    The set "tokeep" contains the ids (pub_id) of the articles to be kept in the filtered corpus.

    Args:
        tokeep (set): set of pub_id to keep.
        in_dir (Path): path of the folder containing the files to filter.
        out_dir (Path): path of the folder where the filtered files are stored.
    '''
    # Standard library imports
    import os
    from pathlib import Path
    from string import Template

    # 3rd party imports
    import pandas as pd

    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING

    t = Template('$colname in @tokeep')  # Template for the query

    for file in DIC_OUTDIR_PARSING.values():
        df = pd.read_csv(in_dir / Path(file), sep='\t')
        query = t.substitute({'colname': df.columns[0]})
        df.query(query).to_csv(out_dir / Path(file),
                               index=False,
                               sep="\t")


def item_filter_modification(item, item_values, filters_filename):
    '''Modifies the item values list in the json file of the filtering configuration
    used for corpus filtering.

    Args:
        item (str): item acronym.
        item_values (list): list of item values to be put in the json file.
        filters_filename (path): path of the json file.

    '''
    # Standard library imports
    import json

    with open(filters_filename, "r") as read_file:
        config_filter = json.load(read_file)

    config_filter[item]['list'] = item_values

    with open(filters_filename, "w") as write_file:
        jsonString = json.dumps(config_filter, indent=4)
        write_file.write(jsonString)


def item_values_list(item_values_file):
    '''Builds a list of item values from a file with the same structure
    as the text files resulting from the corpus description (".dat" extension).

    Args:
        item_values_file (path): path of the .dat file that contains the item values (str).

    Returns:
        item_values_select (list): list of item values.
    '''
    # Standard library imports
    import csv

    item_values = []
    with open(item_values_file, newline='') as f:
        reader = csv.reader(f)
        item_values = list(reader)

    item_values_select = []
    for x in range(len(item_values)):
        mystring = str(item_values[x])
        start = 2
        end = mystring.find(',', start) - 1
        value = str(mystring[start:end]).replace("'", '"')
        item_values_select.append(value)

    return item_values_select


def filters_modification(config_folder, file_config_filters):
    '''Modifies the filter configuration
    using a selection of item values saved in the file item_values_file (.dat file)
    with the same structure as the item files resulting from the corpus description.

    Args:
        config_folder (path): path of the configuration folder
                              containing the file item_values_file selected interactively.
        file_config_filters (path): path of the json filters configuration file.

    '''
    # Standard library imports
    import os
    from pathlib import Path

    # Local imports
    from BiblioAnalysis_Utils.BiblioGui import Select_multi_items
    from BiblioAnalysis_Utils.BiblioGui import filter_item_selection

    # Identifying the item to be modified in the filters configuration
    filter_item = filter_item_selection()

    # Setting the folders list for the item_values selection list
    folders_list = [x[0] for x in os.walk(config_folder)][1:]
    folders_list = [os.path.split(x)[-1] for x in folders_list]
    folders_list.sort()

    # Selection of the folder of the item_values selection files
    print('Please select the folder of the item_values selection file via the tk window')
    myfolder_name = Select_multi_items(folders_list, 'single')[0] + '/'
    myfolder = config_folder / Path(myfolder_name)

    # Setting the list of item_values selection files to be put in the filters configuration file
    files_list = os.listdir(myfolder)
    files_list.sort()
    print('\nPlease select the item_values selection file via the tk window')
    myfile = Select_multi_items(files_list, 'single')[0] + '/'
    item_values_file = myfolder / Path(myfile)

    item_values_list_select = item_values_list(item_values_file)
    ...