Best Python code snippet using autotest_python
check_gazette_filenames.py
Source:check_gazette_filenames.py
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

'''
File: Check Gazette Filenames
------------------------------
Check that gazette filenames match the gazette file stored in the JSON.
Prints errors for manual checking.
'''

import json
import os
from os import listdir, rename, path
import re

# Directory holding the gazette JSON files that get checked / renamed.
filepath = "/home/dssg-cfa/ke-gazettes/"


# In[ ]:

def _first_page_lines(gazette_data):
    '''
    Returns the list of line dicts (each with a 'text' key) of the first page
    of the Read API output.
    '''
    return gazette_data['analyzeResult']['readResults'][0]['lines']


def is_special_issue(gazette_data):
    '''
    Returns if gazette is special issue (True) or not (False).
    A gazette counts as special when any first-page line contains the
    string 'SPECIAL ISSUE'.
    '''
    return any('SPECIAL ISSUE' in line['text']
               for line in _first_page_lines(gazette_data))


def get_date(gazette_data):
    '''
    Returns date reflected in gazette data, formatted like the filename
    component ("dated-<dd>-<month>-<yyyy>", lower case).
    Returns "" when no first-page line contains 'NAIROBI' together with
    a ", <four digits>" year.
    '''
    for line in _first_page_lines(gazette_data):
        text = line['text']
        if 'NAIROBI' not in text:
            continue
        if not re.search(r', \d{4}', text):
            continue

        # Skip the first nine characters (the length of "NAIROBI, ").
        date = text[9:]

        # Day: digits among the first two characters, zero-padded.
        day = "".join(ch for ch in date[:2] if ch.isdigit())
        if len(day) == 1:
            day = "0" + day

        # Month: text between the first space and the first comma.
        month = date[date.find(" ") + 1:date.find(",")]

        year = date.strip()[-4:]

        return ("dated-" + day + "-" + month + "-" + year).strip().lower()

    return ""


def is_dated_correctly(gazette_fn, gazette_data):
    '''
    Returns whether gazette is dated correctly.
    Returns (True, "") if the filename date matches the gazette (or the
    date read from the gazette looks unreliable).
    Returns (False, correct_date) if the filename date is wrong.
    Returns (False, "") if the filename has no date component at all.
    '''
    date = get_date(gazette_data)

    date_match = re.search(r'dated-\d+-[a-z]+-\d{4}', gazette_fn)
    if not date_match:
        # Previously this crashed with AttributeError on None.
        print("Unable to find date in filename; check manually")
        return False, ""
    date_fn = date_match.group()

    # filter for common bugs in getting date
    # (isdigit must be *called*; the bare method object is always truthy)
    looks_reliable = ("--" not in date
                      and " " not in date
                      and date[-4:].isdigit())
    if date != date_fn and looks_reliable:
        return False, date

    return True, ""


def is_numbered_correctly(gazette_fn, gazette_data, just_results = False):
    '''
    Returns (False, new_fn) if vol or issue number from gazette_data are not
    correctly reflected in gazette_fn (filename); new_fn is the suggested
    corrected filename.
    Returns (True, "") if the numbering matches.
    Returns (False, "") if vol / issue number (or the filename date suffix)
    could not be parsed; a message is printed for manual checking.
    If just_results is True, returns (vol, no) as lower-case strings instead
    of comparing against the filename.
    '''
    txt = None
    for line in _first_page_lines(gazette_data):
        if 'Vol' in line['text']:
            txt = line['text']
            break

    if not txt:
        print("Unable to find string \'Vol\'; check manually")
        return False, ""

    idx_vol_start = txt.index("Vol") + 3

    # '>=' because 'Vol' ending the line leaves nothing to parse after it.
    if idx_vol_start >= len(txt):
        print("\'Vol\' string and volume number are on different lines; check manually")
        return False, ""

    vol_match = re.search(r'[A-Z]+', txt[idx_vol_start:])
    if not vol_match:
        print("Unable to find volume number; check manually")
        return False, ""
    vol = vol_match.group(0).lower()

    if "No" not in txt:
        print("Unable to find string \'No\'; check manually")
        return False, ""

    no_match = re.search(r'\d+(A)?', txt[txt.index("No"):])
    if not no_match:
        print("Unable to find issue number; check manually")
        return False, ""
    no = no_match.group(0).lower()

    if just_results:
        return vol, no

    pre = 'gazette-ke-vol-' + vol + '-no-' + no + "-"

    # common bug -- 'h' is misread
    if pre in gazette_fn or (no in gazette_fn and "h" in vol):
        return True, ""

    suffix_match = re.search(r'dated-\d+-[a-z]+-\d+(-special)?', gazette_fn)
    if not suffix_match:
        # Without a date suffix no replacement filename can be built.
        print("Unable to find date in filename; check manually")
        return False, ""

    return False, pre + suffix_match.group(0)


# In[ ]:

def check_filename(gazette_fn, gazette_data):
    '''
    Parameters: current filename of gazette; data - json output from Read API
    Returns false if
    - Incorrectly labelled as "special"
    - Should be labelled as "special" and isn't
    - Dated incorrectly
    - Issue or volume number is incorrect

    Renames Gazettes.
    Currently automatically renames Gazettes incorrectly labelled as -special / not special,
    because this is a common error and the detection for it is reliable.
    Prompts user to rename the Gazette (Y/N options) if volume, issue, or date is incorrect.

    If the first page is empty, prompts to remove the file instead; returns
    True when the file is kept and False when it is removed.
    '''
    named_correctly = True

    print("**Checking " + gazette_fn + "**\n")

    # if there was an error getting the gazette, prompt to remove the file
    read_results = gazette_data['analyzeResult']['readResults']
    first_page = read_results[0]['lines']
    if len(first_page) == 0:
        # Guard against a missing or empty second page before indexing it.
        second_page = read_results[1]['lines'] if len(read_results) > 1 else []
        if second_page and "Error" in second_page[0]['text']:
            print("Error almost definitely encountered")
            print("Empty first page and error message on second page.")
        else:
            print("Empty first page; likely error in gazette. Check in database.")

        confirm = input("Do you want to remove this file? (Y/N): ")
        if "Y" not in confirm and "y" not in confirm:
            print("Keeping file " + gazette_fn)
            return True
        else:
            print("Removing file " + gazette_fn)
            os.remove(filepath + gazette_fn)
            print("Done.\n")
            return False

    # CHECK THAT FILENAME MATCHES GAZETTE CONTENT and prompt to rename
    if "-special" in gazette_fn and not is_special_issue(gazette_data):
        print("Gazette incorrectly labelled as special.")
        print("Calling \'rename_gazette\' with flag = \'from_special\'\n")
        renamed, new_fn = rename_gazette(gazette_fn, flag = "from_special")
        if renamed:
            gazette_fn = new_fn
        named_correctly = False

    if not "-special" in gazette_fn and is_special_issue(gazette_data):
        print("Gazette is a special issue and not labelled as such.")
        print("Calling \'rename_gazette\' with flag = \'to_special\'\n")
        renamed, new_fn = rename_gazette(gazette_fn, flag = "to_special")
        if renamed:
            gazette_fn = new_fn
        named_correctly = False

    date_correct, real_date = is_dated_correctly(gazette_fn, gazette_data)
    if not date_correct and len(real_date) > 0:
        print("Gazette is dated incorrectly.")
        print("Calling \'rename_gazette\' with flag = \'dated\' and dated_str = " + real_date + "\n")
        renamed, new_fn = rename_gazette(gazette_fn, flag = "dated", dated_str = real_date)
        if renamed:
            gazette_fn = new_fn
        named_correctly = False

    number_correct, new_fn = is_numbered_correctly(gazette_fn, gazette_data)
    if not number_correct and len(new_fn) > 0:
        print("Gazette volume/issue numbering is incorrect.")
        print("Calling \'rename_gazette\' with flag \'pre\' and fn_with_pre = " + new_fn + "\n")
        renamed, new_fn = rename_gazette(gazette_fn, flag = "pre", fn_with_pre = new_fn)
        if renamed:
            gazette_fn = new_fn
        named_correctly = False

    print("-------------")
    return named_correctly


# In[ ]:

def rename_gazette(gazette_fn, flag, dated_str = "", fn_with_pre = ""):
    '''
    ONLY CALL THIS FUNCTION IF YOU ARE SURE YOU WANT TO RENAME SOMETHING
    Rename gazette at gazette_fn according to flag:
        to_special: add "-special" at end
        from_special: remove "-special" from end
        dated: change the date in the file name to `dated_str`
        pre: change issue/volume number prefix (use `fn_with_pre` as new name)

    Returns (True, new_fn) if the file was renamed, (False, "") otherwise
    (target already exists, or the user declined).
    Raises ValueError (an Exception subclass) on an unknown flag.
    '''

    if flag == "to_special":
        new_fn = gazette_fn.strip() + "-special"

    elif flag == "from_special":
        stripped = gazette_fn.strip()
        if stripped.endswith("-special"):
            new_fn = stripped[:-len("-special")]
        else:
            # Do not blindly chop eight characters off an unrelated ending.
            new_fn = stripped.replace("-special", "")

    # a bit more complicated --
    # need to ensure preservation of '-special' & of vol/issue numbers, if applicable
    elif flag == "dated":
        new_fn = dated_str
        if "-special" in gazette_fn:
            new_fn += "-special"
        # Test the match object itself; calling .group() on None would raise.
        pre = re.search(r'gazette-ke-vol-[a-zA-Z]+-no-\d{1,4}(a)?-', gazette_fn)
        if pre:
            new_fn = pre.group(0) + new_fn

    elif flag == "pre":
        new_fn = fn_with_pre

    else:
        raise ValueError("Invalid flag arg: should be to_special, from_special, dated, or pre")

    if path.exists(filepath + new_fn):
        print("Error: gazette already exists. Not renaming.")
        print("New filename (attempted): " + new_fn)
        print("Current filename: " + gazette_fn + "\n")
        # Recommend uncommenting this only once all files are correctly named
        # os.remove(filepath + gazette_fn)
        return False, ""

    print("Renaming " + gazette_fn + " to " + new_fn)

    # Prompt user to confirm before changing date or volume/issue numbering
    # The to/from special code is quite accurate, but *change this if you have concerns.*
    if "special" in flag:
        confirm = "Y"
    else:
        confirm = input("Are you sure you want to rename? (Y/N): ")
    if "Y" not in confirm and "y" not in confirm:
        print("Not renaming file " + gazette_fn + " to " + new_fn + ".\nStopping...\n")
        return False, ""

    rename(filepath + gazette_fn, filepath + new_fn)
    print("Done.\n")
    return True, new_fn


# In[ ]:

def check_all_filenames(yr_start = 0, yr_end = 0):
    '''
    Check all filenames against gazette data.
    Option to filter by year.

    Calls `rename_gazette`, which prompts user to rename if they would like to. (Y/N)
    NOTE: Currently `check_filename` is implemented to auto-rename Gazettes that are
    incorrectly labelled as special / not special issues.
    Change this in `rename_gazette` if you have concerns.
    '''
    if (yr_start == 0 and yr_end > 0) or (yr_start > 0 and yr_end == 0):
        print("Error: must enter both yr_start and yr_end")
        return

    # path where gazettes are stored
    # (this local name shadows the `path` imported from os within this function)
    path = "/home/dssg-cfa/ke-gazettes/"

    fn_list = [f for f in listdir(path)]

    # filter by year
    # NOTE(review): range() excludes yr_end, and the first four-digit run in
    # the filename is taken as the year -- confirm both are intended.
    if yr_start > 0:
        yr_list = [str(y) for y in range(yr_start, yr_end)]
        fn_list = list(filter(lambda f: re.search(r'\d{4}', f).group(0) in yr_list, fn_list))

    for gazette_fn in fn_list:
        with open(path + gazette_fn) as f:
            gazette_data = json.load(f)
        # (the remainder of this snippet is truncated in the source listing)
        ...
create_db_mapping.py
Source:create_db_mapping.py
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

import json
import requests
import os
from helpers import check_gazette_filenames as cf
from helpers import write_urls as wu
from helpers import dest_fn_from_url as df

# Folder with the first-page JSON files coming from the source databases.
FOLDER = "/home/dssg-cfa/ke-gazettes-first-pgs/"
# Folder with the gazettes as currently named in our database.
FOLDER_CURR = "/home/dssg-cfa/ke-gazettes/"

# In[ ]:

'''
The final data structure will have the form:
- Key: name in our database
- Value: a dictionary with
--- src_database: source database(s) (list)
--- names_in_db (list)
--- checksums: (if connected africa) -- a list, but should just be one of these
--- docids: (if connected africa) document id (unique to the document)
--- docnums: (if gazeti) document number
'''


# In[ ]:

# create a mapping from hash (checksums) and name to doc ID
def get_to_id():
    '''
    Returns a dict mapping (first checksum, name) -> list of document ids,
    built from the 'results' of `wu.conn_afr_api_call()`.
    '''
    data_json = wu.conn_afr_api_call()
    hash_and_name_to_id = {}
    for result in data_json['results']:
        key = (result['checksums'][0], result['name'])
        hash_and_name_to_id.setdefault(key, []).append(result['id'])

    return hash_and_name_to_id


def info_to_std_format(vol, issue, date, special):
    '''
    Builds the standard lower-case filename
    "gazette-ke-vol-<vol>-no-<issue>-<date>[-special]".
    A purely numeric `vol` is first converted with `df.num2roman`.
    '''
    if vol.isdigit():
        vol = df.num2roman(int(vol))
    name = "gazette-ke-vol-" + vol.lower() + "-no-" + issue + "-" + date
    if special:
        name += "-special"
    return name.lower()


def fn_to_std_format(fn):
    '''
    Derives the standard filename from a source filename by slicing out the
    text between "vol-"/"-no", "no-"/"-dated" and "dated-"/"_".
    '''
    special = "special" in fn
    fn_trimmed = fn.replace("-special", "")
    vol = fn_trimmed[fn_trimmed.find("vol-") + 4:fn_trimmed.find("-no")]
    no = fn_trimmed[fn_trimmed.find("no-") + 3:fn_trimmed.find("-dated")]
    dated = fn_trimmed[fn_trimmed.find("dated-"):fn_trimmed.find("_")]
    return info_to_std_format(vol, no, dated, special)


def get_true_fn(gazette_data):
    '''
    Returns the standard filename derived from the gazette content itself,
    or "invalid_fn_placeholder" when volume or issue number cannot be read.
    '''
    vol, issue = cf.is_numbered_correctly("", gazette_data, just_results = True)
    if not vol or not issue:
        return "invalid_fn_placeholder"
    date = cf.get_date(gazette_data)
    special = cf.is_special_issue(gazette_data)
    return info_to_std_format(vol, issue, date, special)


def get_info_gazeti(fn, new_fn, gazette_data, fn_mapping):
    '''
    Given: filepath to first page JSON (fn), the standard filename (new_fn)
    and the mapping built so far.
    Returns: the dictionary for new_fn (the existing entry of fn_mapping if
    there is one, otherwise a fresh one) updated with the gazeti source
    name and document number taken from fn.
    `gazette_data` is accepted but not used.
    '''
    fn = fn.replace(FOLDER, "")
    if new_fn in fn_mapping:
        to_src = fn_mapping[new_fn]
        if "docnums" not in to_src:
            to_src["docnums"] = []
    else:
        to_src = {"src_database": [], "names_in_db": [], "docnums": []}

    if "gazeti" not in to_src["src_database"]:
        to_src["src_database"].append("gazeti")

    # Source name is everything before the first underscore.
    src_name = fn[0:fn.find("_")]
    if src_name not in to_src["names_in_db"]:
        to_src["names_in_db"].append(src_name)

    # Document number is everything after the last hyphen.
    num = fn[fn.rfind("-") + 1:]
    to_src["docnums"].append(num)

    return to_src


def get_info_conn_af(fn, new_fn, gazette_data, fn_mapping, hash_and_name_to_id):
    '''
    Given: filepath to first page JSON (fn), the standard filename (new_fn),
    the mapping built so far and the (checksum, name) -> ids map.
    Returns: the dictionary for new_fn (the existing entry of fn_mapping if
    there is one, otherwise a fresh one) updated with the connected-africa
    source name, checksum and document ids taken from fn.
    `gazette_data` is accepted but not used.
    '''
    fn = fn.replace(FOLDER, "")
    if new_fn in fn_mapping:
        to_src = fn_mapping[new_fn]
        if "checksums" not in to_src:
            to_src["checksums"] = []
            to_src["docids"] = []
    else:
        to_src = {"src_database": [], "names_in_db": [], "checksums": [], "docids": []}

    if "connected-africa" not in to_src["src_database"]:
        to_src["src_database"].append("connected-africa")

    # Source name is everything before the first underscore.
    src_name = fn[0:fn.find("_")]
    if src_name not in to_src["names_in_db"]:
        to_src["names_in_db"].append(src_name)

    # Checksum is everything after the last underscore, with '*' removed.
    checksum = fn[fn.rfind("_") + 1:].replace("*", "")
    if checksum not in to_src["checksums"]:
        to_src["checksums"].append(checksum)

    # A file unknown to the API map must not abort the whole run.
    id_list = hash_and_name_to_id.get((checksum, src_name))
    if id_list is None:
        print("no document id found for " + fn + "\n")
        id_list = []
    for docid in id_list:
        if docid not in to_src['docids']:
            to_src['docids'].append(docid)

    return to_src


def get_info():
    '''
    Builds and returns the mapping from standard filename to source info
    for every first-page JSON in FOLDER.
    Extra keys: "empty_files" (files with an empty first page) and
    "failed_to_map_from_cfa_db" (the result of `loop_failures`).
    '''
    fn_mapping = {}
    fn_mapping["empty_files"] = []
    failures = []
    hash_and_name_to_id = get_to_id()
    fns = [FOLDER + f for f in os.listdir(FOLDER)]
    curr_fns = os.listdir(FOLDER_CURR)

    for fn in fns:
        with open(fn) as f:
            gazette_data = json.load(f)

        first_page = gazette_data['analyzeResult']['readResults'][0]['lines']
        if len(first_page) == 0:
            fn_mapping["empty_files"].append(fn.replace(FOLDER, ""))
            continue

        new_fn = get_true_fn(gazette_data)

        # Names not present in the current folder are retried in loop_failures.
        if new_fn not in curr_fns:
            failures.append(fn.replace(FOLDER, ""))
            continue

        if "connected-africa" in fn:
            to_src = get_info_conn_af(fn, new_fn, gazette_data, fn_mapping, hash_and_name_to_id)
        elif "gazeti" in fn:
            to_src = get_info_gazeti(fn, new_fn, gazette_data, fn_mapping)
        else:
            print("invalid filename for " + fn + "\n")
            continue

        fn_mapping[new_fn] = to_src

    failures = loop_failures(failures, fn_mapping, curr_fns, hash_and_name_to_id)
    fn_mapping['failed_to_map_from_cfa_db'] = failures

    print("failed on " + str(len(failures)))
    return fn_mapping


def loop_failures(failures, fn_mapping, curr_fns, hash_and_name_to_id):
    # Second pass over the failures: derive the name from the source filename
    # (instead of the gazette content) and correct only the '-special' suffix.
    # NOTE(review): when fn contains neither "connected-africa" nor "gazeti",
    # `to_src` is unbound or left over from a previous iteration -- confirm.

    new_failed = {}

    for fn in failures:
        new_fn = fn_to_std_format(fn)

        with open(FOLDER + fn) as f:
            gazette_data = json.load(f)

        if cf.is_special_issue(gazette_data):
            if "-special" not in new_fn:
                new_fn += "-special"
        else:
            if "-special" in new_fn:
                new_fn = new_fn.replace("-special", "")

        if new_fn in curr_fns:
            if "connected-africa" in fn:
                to_src = get_info_conn_af(fn, new_fn, gazette_data, fn_mapping, hash_and_name_to_id)
            elif "gazeti" in fn:
                to_src = get_info_gazeti(fn, new_fn, gazette_data, fn_mapping)
            fn_mapping[new_fn] = to_src
        else:
            new_failed[fn] = new_fn
    # (the remainder of this snippet is truncated in the source listing)
    ...
rename.py
Source:rename.py
import os

# Build a cleaned-up name for every '.wiki' entry of the current directory:
# all dots are dropped from the name and a single '.wiki' suffix is re-added.
for fn in os.listdir():
    if '.wiki' not in fn:
        continue
    new_fn = fn.replace('.wiki', '').replace('.', '') + '.wiki'
    # (the remainder of this snippet is truncated in the source listing)
    ...
Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 automation test minutes FREE!!