Best Python code snippet using lisa_python
athens_scrapping.py
Source: athens_scrapping.py
import requests, re, pandas as pd
from sqlalchemy import create_engine
from dateparser.search import search_dates
from bs4 import BeautifulSoup
import camelot
import numpy as np
from pandas import *
from scraping.models import *


def html_parser(url):
    '''Gets a URL and returns the parsed HTML to be used as a soup variable'''
    try:
        page = requests.get(url)
    except requests.exceptions.Timeout:
        # TODO: try again in 30 minutes (add trigger)
        pass
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)
    soup = BeautifulSoup(page.content, "html.parser")
    # returns the HTML code of the website
    return soup


def get_pdf_list(parsed_url):
    '''Gets the list of the most recent PDFs (title, URL, on-hold date) from the parsed HTML code'''
    soup = html_parser(parsed_url)
    # Get list of URLs pointing to pdf files
    pdf_docs = soup.find_all('a', {'title': re.compile(r'.*\.pdf')})
    # Get list of titles and urls. NOTE: ENTER THE NUMBER OF DOCS
    try:
        pdf_json = []
        for pdf_doc in pdf_docs[:3]:
            pdf_title = pdf_doc.find(text=True, recursive=False).strip()
            pdf_url = parsed_url + pdf_doc.get('href').strip()
            onhold_date = search_dates(pdf_title, languages=['el'])[0][1]
            # Add to json item
            item = {"pdf_title": pdf_title, "pdf_url": pdf_url, "onhold_date": onhold_date}
            pdf_json.append(item)
    except:
        print(f"Error s101 {pdf_title}")
        raise Exception(f"Couldn't build json file with urls, title: {pdf_title} and onhold date")
    print("Page parsed and json has been built with doc title, url and onhold date")
    return pdf_json


'''TODO: Change to be secure'''
def pass_to_database(username, password, server, port, database, dataframe, table):
    engine = create_engine(f'postgresql://{username}:{password}@{server}:{port}/{database}', encoding='utf-8-sig')
    dataframe.to_sql(table, engine, if_exists='replace')


def runcsript():
    # Define the URL that you want to scrape:
    athens_hospitals_url = 'https://www.moh.gov.gr/articles/citizen/efhmeries-nosokomeiwn/68-efhmeries-nosokomeiwn-attikhs'
    pdf_json_info = get_pdf_list(athens_hospitals_url)  # json data of hospitals
    '''Start of cleanup process
    for i, json_item in enumerate(pdf_json_info):
        if "ΟΡΘΗ ΕΠΑΝΑΚΟΙΝΟΠΟΙΗΣΗ" in json_item['pdf_title'] and json_item['onhold_date'] == pdf_json_info[i+1]['onhold_date']:
            pdf_json_info.pop(i+1)
    '''
    tables_received = []
    for pdf_item in pdf_json_info:
        pdf_url = pdf_item["pdf_url"]
        pdf_title = pdf_item["pdf_title"]
        onhold_date = pdf_item["onhold_date"]
        print(f"Going through the item with title: {pdf_title}")
        try:
            # Read the pdf
            tables = camelot.read_pdf(pdf_url, pages='1-end')
            number_of_tables = tables.n
            num_of_columns = len(tables[0].df.columns)
            print(f"num of tables: {number_of_tables}, num of columns: {num_of_columns}")
        except:
            print(f"Error s102 {pdf_title}")
            raise Exception(f"Couldn't parse the pdf file with title: {pdf_title}")
        '''Process to concat tables'''
        try:
            all_tables = []
            for i in range(number_of_tables):
                table = tables[i].df
                all_tables.append(table)
            concat_pdf_tables = pd.concat(all_tables, axis=0, ignore_index=True)
            concat_pdf_tables.rename(columns={concat_pdf_tables.columns[0]: "clinic"}, inplace=True)  # Set first column to clinic
            start_new_table_from = concat_pdf_tables.loc[concat_pdf_tables['clinic'].str.contains("ΚΛΙΝΙΚΕΣ|Κλινικές", case=False)].first_valid_index()  # Returns the first matching index
            concat_pdf_tables = concat_pdf_tables.iloc[start_new_table_from:].reset_index(drop=True)
            new_header = concat_pdf_tables.iloc[0]  # grab the first row for the header
            concat_pdf_tables = concat_pdf_tables[1:]  # take the data less the header row
            concat_pdf_tables.columns = new_header  # set the header row as the df header
            concat_pdf_tables.reset_index(drop=True)
            concat_pdf_tables.rename(columns={concat_pdf_tables.columns[0]: "clinic"}, inplace=True)  # Set first column to clinic
            search = concat_pdf_tables.loc[concat_pdf_tables['clinic'].str.contains("ΚΛΙΝΙΚΕΣ|Κλινικές", case=False)]  # Find and remove all the clinic header rows
            final_results = concat_pdf_tables.drop(search.index.values).reset_index(drop=True)
            df_unpivoted = final_results.melt(id_vars=['clinic', 'ΠΑΡΑΤΗΡΗΣΕΙΣ'], var_name='onhold_time', value_name='hospital_names')
            df_unpivoted['hospital_names'].replace('', np.nan, inplace=True)
            df_unpivoted.dropna(subset=['hospital_names'], inplace=True)
            df_unpivoted = df_unpivoted.reset_index(drop=True)
            cleanup_process = df_unpivoted.rename(columns={"Clinic": "clinic", "ΠΑΡΑΤΗΡΗΣΕΙΣ": "note", "onhold_time": "onhold_hour", "hospital_names": "hospital_name"})
            cleanup_process.head()
            # cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΠΕΙΡΑΙΑΣ\n', 'ΠΕΙΡΑΙΑΣ ', regex=True)  # remove this if needed
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠΕΙΡΑΙΑΣ \n', ', ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠΕΙΡΑΙΑΣ\n', ' ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΠΕΙΡΑΙΑΣ \n', 'ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'Γ. \nΓΕΝΝΗΜΑΤΑΣ', 'Γ. ΓΕΝΝΗΜΑΤΑΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'Η \nΠΑΜΜΑΚΑΡΙΣΤΟΣ', 'Η ΠΑΜΜΑΚΑΡΙΣΤΟΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓ. \nΠΑΝΤΕΛΕΗΜΩΝ', 'ΑΓ. ΠΑΝΤΕΛΕΗΜΩΝ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓΙΟΙ \nΑΝΑΡΓΥΡΟΙ', 'ΑΓΙΟΙ ΑΝΑΡΓΥΡΟΙ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓΙΟΣ \nΣΑΒΒΑΣ', 'ΑΓΙΟΣ ΣΑΒΒΑΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nÎ.', ', Î.', regex=True)  # initial letter lost in the source encoding
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠ.', ', Π.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nÎ.', ', Î.', regex=True)  # initial letter lost in the source encoding
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΨ.', ', Ψ.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)
            separate_data = cleanup_process['hospital_name'].str.split(',').apply(Series, 1).stack()
            separate_data.index = separate_data.index.droplevel(-1)
            separate_data.name = 'hospital_name'
            del cleanup_process['hospital_name']
            final_data = cleanup_process.join(separate_data)
            final_data['onhold_date'] = onhold_date
            final_data['region'] = "Athens"
            final_data['hospital_name'] = final_data['hospital_name'].str.strip()
            final_data = final_data[final_data.hospital_name != 'ΠΕΙΡΑΙΑΣ']
            final_data = final_data.reset_index(drop=True)
            tables_received.append(final_data)
            print(f"Table for day: {onhold_date} and pdf {pdf_title} was added")
        except:
            print(f"Error s103 {table}")
            raise Exception(f"Couldn't build the table with data: {table}")
    df_merge = pd.concat(tables_received)
    df_merge.reset_index(drop=True)
    # Not able to iterate directly over the DataFrame
    df_records = df_merge.to_dict('records')
    return df_records


if __name__ == '__main__':
    # test1.py executed as script
    # do something...
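The __main__ block above is left as a stub, and pass_to_database still carries the "TODO: Change to be secure" note because it receives raw credentials. Below is a minimal sketch, not part of the original project, of one way to wire runcsript() and pass_to_database() together while reading credentials from environment variables; the environment variable names and the target table name are illustrative assumptions.

# Hypothetical entry point: wires the scraper output into the database helper.
# Assumes athens_scrapping.py is importable and that the listed env vars exist.
import os
import pandas as pd

from athens_scrapping import runcsript, pass_to_database

if __name__ == '__main__':
    records = runcsript()                      # list of dicts, one row per hospital/time slot
    df = pd.DataFrame(records)                 # pass_to_database expects a DataFrame
    pass_to_database(
        username=os.environ['DB_USER'],        # illustrative variable names
        password=os.environ['DB_PASSWORD'],
        server=os.environ.get('DB_HOST', 'localhost'),
        port=os.environ.get('DB_PORT', '5432'),
        database=os.environ['DB_NAME'],
        dataframe=df,
        table='athens_onhold_hospitals',       # illustrative table name
    )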
main.py
Source: main.py
from subprocess import Popen, PIPE
import atexit
import os

agent_processes = [None, None]


def cleanup_process():
    global agent_processes
    for proc in agent_processes:
        if proc is not None:
            proc.kill()


def js_agent(observation, configuration):
    """
    a wrapper around a js agent
    """
    global agent_processes
    agent_process = agent_processes[observation.player]
    ### Do not edit ###
    if agent_process is None:
        cwd = os.path.dirname(configuration["__raw_path__"])
        agent_process = Popen(["node", "dist/main.js"], stdin=PIPE, stdout=PIPE, cwd=cwd)
...