Best Python code snippet using unittest-xml-reporting_python
Source: scrapy_b.py
...
    return soup


# Get the list of anime (bangumi) links
def getFirstContent(soup):
    # print(content)
    # soup = BeautifulSoup(content, "html.parser")
    # From the index/search page, pull the link of each video entry
    next_urls = []
    infos = soup.find_all('a', 'bangumi-title')
    for info in infos:
        next_urls.append(info['href'].strip())
    # print(len(infos))

    return next_urls


# Get the detail info of each anime
def getDetail(path, fname_detail):
    links_ = pd.read_csv(path)
    links = links_.drop_duplicates()  # there may be duplicates, so deduplicate
    urls = links['links']
    cont_id = 0
    print("start!")
    v_ids = []        # ids
    titles = []       # titles
    genres = []       # genres
    years = []        # years
    long_comms = []   # number of long reviews
    short_comms = []  # number of short reviews
    detail_link = []  # current page links
    for url2 in tqdm(urls):
        try:
            soup1 = get_soup(r'http:' + url2)
            next_link = soup1.find('a', 'media-title')['href']
            soup2 = get_soup(r'http:' + next_link + r'#long')  # long-review page

            '''
            soup2.find('div', 'media-tab-nav').find('ul').find_all('li'):
            [<li class="">Details</li>,
             <li class="on">Long reviews ( 572 )</li>,
             <li class="">Short reviews ( 117867 )</li>,
             <li class="">Related videos</li>]
            '''
            # Review counts: slice the number out of a string like '长评 ( 572 )' and cast to int;
            # entries without review info raise here and are skipped by the except below
            long = int(soup2.find('div', 'media-tab-nav').find('ul').find_all('li')[1].string[5:-2])
            short = int(soup2.find('div', 'media-tab-nav').find('ul').find_all('li')[2].string[5:-2])
            long_comms.append(long)
            short_comms.append(short)
            # Title
            title = soup2.find('span', 'media-info-title-t').string
            titles.append(title)
            # Tags, joined into a comma-separated string like 'tag1,tag2,tag3,'
            tags = ''
            for tag in soup2.find('span', 'media-tags').children:
                tags = tags + str(tag.string) + ','
            genres.append(tags)
            # Air date: drop the trailing '开播' ("premiered") from a string like '2019年4月7日开播'
            year = soup2.find('div', 'media-info-time').span.string[:-2]
            years.append(year)

            # Video id
            v_ids.append(soup1.find('a', 'av-link').string)
            cont_id += 1
            # v_ids.append(cont_id)
            # Current page link
            detail_link.append(r'http:' + next_link)

            # soup2.find('div','review-list-wrp type-long').find('ul').contents
            if cont_id % 10 == 0:
                print('crawled %d items' % cont_id)
            # Write every 5 items so an interruption does not lose the whole crawl
            if cont_id % 5 == 0:
                # Write
                Data_detail = {'v_id': v_ids, 'title': titles, 'genres': genres, 'year': years,
                               'long_comm': long_comms,
                               'short_comm': short_comms, 'detail_link': detail_link}
                wirte2csv(Data_detail, fname_detail)
                # Clear the buffers
                v_ids = []        # ids
                titles = []       # titles
                genres = []       # genres
                years = []        # years
                long_comms = []   # number of long reviews
                short_comms = []  # number of short reviews
                detail_link = []  # current page links
            time.sleep(5)

        except Exception:
            pass
    return


# Get the related recommendations of each anime
def getRecommond(path, fname_detail):
    detail_data = pd.read_csv(path)
    detail_data_ = detail_data.drop_duplicates()  # there may be duplicates, so deduplicate
    urls = detail_data_['detail_link']
    cont_id = 0
    print("start!")
    v_ids = []      # ids
    rec_id = []     # recommended ids
    rec_title = []  # recommended titles
    for url2 in tqdm(urls):
        try:
            soup1 = get_soup(url2)
            # Increment the counter
            cont_id += 1

            v_ids.append(detail_data_.loc[cont_id, 'v_id'])
            # Titles of the recommended anime
            tmp_title = []
            for title in soup1.find_all('div', 'slide-item-title'):
                tmp_title.append(title.string)
            rec_title.append(tmp_title)
            # Links of the recommended anime
            rec_links = []
            for l in soup1.find_all('div', 'slide-item-info'):
                rec_links.append(l.find('a')['href'])
            # Ids of the recommended anime
            tmp_id = []
            for link in rec_links:
                soup2 = get_soup(r'http:' + link)
                tmp_id.append(soup2.find('a', 'av-link').string)

            rec_id.append(tmp_id)

            if cont_id % 10 == 0:
                print('crawled %d items' % cont_id)

            # Write every 5 items so an interruption does not lose the whole crawl
            if cont_id % 5 == 0:
                # Write
                Data_detail = {'v_id': v_ids, 'rec_id': rec_id, 'rec_title': rec_title}
                wirte2csv(Data_detail, fname_detail)
                # Clear the buffers
                v_ids = []      # ids
                rec_id = []     # recommended ids
                rec_title = []  # recommended titles

            time.sleep(rand_seconds)

        except Exception:
            pass
    return


def process_time(rat_time):
    # Already a full date such as 2020-05-07, len == 10
    if len(rat_time) == 10:
        return rat_time
    else:
        if len(re.findall(r'^\d+小时前$', rat_time)):  # 'N hours ago'
            return (datetime.datetime.now() - datetime.timedelta(hours=int(rat_time[:-3]))).strftime("%Y-%m-%d")

        elif len(re.findall(r'^\d+分钟前$', rat_time)):  # 'N minutes ago'
            return (datetime.datetime.now()).strftime("%Y-%m-%d")

        elif rat_time == '昨天':  # 'yesterday'
            return (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        elif len(rat_time) == 5:  # no year, e.g. '05-07'
            return str(datetime.datetime.now().year) + '-' + rat_time


# Scroll the review page and collect the review info
def get_rating(url, page_num):
    # Load the page in the browser
    driver.get(url)
    # driver.get(url + r'#long')
    # page_num = long_page_num
    id_names = []
    ratings = []
    rating_times = []
    # Scroll page_num times
    for i in range(page_num):
        # Run a small JS snippet in the browser; document.body.scrollHeight is the page height
        js = "window.scrollTo(0,document.body.scrollHeight)"
        driver.execute_script(js)
        time.sleep(rand_seconds)
        # On Bilibili everything above the scroll position stays loaded,
        # so only the page after the last scroll needs to be parsed
        if i == page_num - 1:
            # Grab the page source
            content = driver.page_source
            # Parse it
            soup = BeautifulSoup(content, 'lxml')
            # Collect the ids on this page
            for li in soup.find_all('li', 'clearfix'):
                id_names.append(li.find('div', re.compile('review-author-name')).string.strip())
                rat = len(li.find_all('i', 'icon-star icon-star-light'))  # rating
                ratings.append(rat)

                rat_time = li.find('div', 'review-author-time').string
                # Normalize the special time formats
                rat_time_2 = process_time(rat_time)
                rating_times.append(str(rat_time_2))

    return id_names, ratings, rating_times


# Get the ratings (and related info) and write them to csv
def get_rating_data(path):
    detail = pd.read_csv(path)
    # print(min(detail['short_comm']+detail['long_comm']))  # 230
    # print(detail.columns)  # ['v_id', 'title', 'genres', 'year', 'long_comm', 'short_comm', 'detail_link']
    minn = min(detail['short_comm'] + detail['long_comm'])
    rating_links = detail['detail_link']
    long_num = detail['long_comm']
    short_num = detail['short_comm']
    v_ids = detail['v_id']
    for ind, url in enumerate(tqdm(rating_links)):
        # print(ind, url)
        # if ind < 425:
        #     continue
        # Split long and short reviews proportionally
        # print(v_ids[61])
        # Long reviews are far fewer than short ones; a plain proportion of the total review count
        # could ask for more long reviews than exist and drop data, so cap with min() here
        lon = min(int((long_num[ind] / (long_num[ind] + short_num[ind])) * minn), long_num[ind])
        sho = minn - lon

        long_page_num = math.ceil(lon / 20)   # 20 items per page: how many pages to scroll
        short_page_num = math.ceil(sho / 20)  # 20 items per page: how many pages to scroll

        id_l, rat_l, time_l = get_rating(url + r'#long', long_page_num)
        id_s, rat_s, time_s = get_rating(url + r"#short", short_page_num)
        # print(len(id_l))
        # print(len(id_s))

        # Trim the long and short reviews down to their allocated counts
        id_total = id_l[0:lon] + id_s[0:sho]
        rat_total = rat_l[0:lon] + rat_s[0:sho]
        rating_time_total = time_l[0:lon] + time_s[0:sho]
        # print(len(id_total))
        # print(len(rat_total))

        # Pack into a DataFrame
        Data_rating = {'user_id_name': id_total, 'v_id': [v_ids[ind]] * minn, 'rating': rat_total,
                       'rating_time': rating_time_total}
        # print(Data_rating)
        fname_rating = "rating_data.csv"
        wirte2csv(Data_rating, fname_rating)
    return


# Write to csv
def wirte2csv(Data, fname):
    try:
        if os.path.exists(fname):
            DataFrame = pd.DataFrame(Data)
            DataFrame.to_csv(fname, index=False, sep=',', mode='a', header=False)
            print('append ok!')
        else:
            DataFrame = pd.DataFrame(Data)
            DataFrame.to_csv(fname, index=False, sep=',')
            print('save!')
    except:
        print('fail')


if __name__ == '__main__':
    flag1 = 0  # crawl the anime index pages?
    flag2 = 0  # crawl the anime details?
    flag3 = 0  # crawl the ratings?
    flag4 = 1  # crawl the related recommendations?
    if flag1:
        # step1
        for i in tqdm(range(21)):
            # Start from 0 because the first page visited gets requested twice in a row,
            # which would duplicate data; so at i == 0 the page is fetched but nothing is written
            # Anime index pages, pages 1-20
            url = 'https://www.bilibili.com/anime/index/#season_version=-1&area=-1' \
                  '&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1' \
                  '&style_id=-1&order=3&st=1&sort=0&page=' + str(i + 1)
            # Refresh (important!!! otherwise the first page may be crawled repeatedly)
            driver.refresh()
            # print(url)
            soup = get_soup(url)
            if i == 0:
                continue
            # driver.find_element_by_class_name('p next-page').click()
            next_urls = getFirstContent(soup)
            print(next_urls)
            # Write to csv
            Data_link = {'links': next_urls}
            fname_link = "link_data.csv"
            wirte2csv(Data_link, fname_link)
            print('crawled page %d' % i)
            # Pause
            time.sleep(5)
    if flag2:
        # step2
        path = r'D:\Learning\postgraduate\bilibili\scrapy_py\link_data.csv'
        # Crawl the details and write them to a new csv
        getDetail(path, fname_detail="video_data.csv")
    if flag3:
...
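The snippet above relies on a few globals defined in the elided part of the file: get_soup, driver, rand_seconds, plus the pandas/tqdm/BeautifulSoup imports. Below is a minimal sketch of what that setup could look like, assuming a Selenium-driven browser feeding BeautifulSoup; only the names come from the snippet, the rest is an assumption rather than the author's original code.

import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver

# Assumed setup, not the original elided code: bilibili pages are JS-rendered,
# so the script drives a real browser and parses the rendered HTML.
driver = webdriver.Chrome()
rand_seconds = random.randint(3, 6)  # crawl delay reused by the time.sleep calls above

def get_soup(url):
    # Load the page in the browser and hand the rendered HTML to BeautifulSoup
    driver.get(url)
    time.sleep(rand_seconds)
    return BeautifulSoup(driver.page_source, 'lxml')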
Source: wikiCrawler.py
...
from nltk import pos_tag
from mtranslate import translate
remove_words = ["\\n", "\"", "\\t", "[", "]", "â", "â", "·"]
stop_words = set(stopwords.words('english'))
def getFirstContent(soup):
    p_sentences = list()
    div = soup.find('div', {'class': 'mw-parser-output'})
    if (div is None):
        return ''
    children = div.findChildren(recursive=False)
    for child in children:
        if (child.name == 'h2' or child.name == 'h3'):
            break
        if (child.name == 'p'):
            if (child.text == "\\n\\n"):
                break
            tags_to_delete = child.findAll('sup')
            if (tags_to_delete is not None):
                for tg in tags_to_delete:
                    tg.extract()
            articleText = child.get_text(" ").replace(u'\xa0', u' ')
            articleText = articleText.replace("\\'", "'")
            for word in remove_words:
                if word in articleText:
                    articleText = articleText.replace(word, u"")
            # remove the characters between the parentheses and brackets
            articleText = re.sub("[\(\[].*?[\)\]]", "", articleText)
            # remove multi-spaces
            articleText = re.sub(" +", " ", articleText)
            sentences = list(map(str.strip, re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", articleText)))
            for each_sentence in sentences:
                if (len(each_sentence) >= 10):
                    p_sentences.append(each_sentence)
    return p_sentences
def getSentences(response, outfile):
    doc = Document(response.text)
    content = Document(doc.content()).summary()
    soup = BeautifulSoup(content, "html.parser")
    delete_tags = ['figure']
    for tag in delete_tags:
        tags_to_delete = soup.findAll(tag)
        if (tags_to_delete is not None):
            for tg in tags_to_delete:
                tg.extract()
    tags_to_delete = soup.findAll('p', text="\\n")
    if (tags_to_delete is not None):
        for tg in tags_to_delete:
            tg.extract()
    tags_to_delete = soup.findAll('p', {"class": "shortdescription"})
    if (tags_to_delete is not None):
        for tg in tags_to_delete:
            tg.extract()
    p_sentences = getFirstContent(soup)
    if (p_sentences == ''):
        return False
    for sen in p_sentences:
        outfile.write(sen + "\n")
    outfile.close()
    return True
def getContentOnWiki(link, rec=True):
    visitedUrlFile = "visited_urls.txt"
    try:
        fileUrls = open(visitedUrlFile, 'r', encoding='utf-8')
    except IOError:
        visitedUrls = []
    else:
        visitedUrls = [url.strip() for url in fileUrls.readlines()]
...
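A small usage sketch for the extractor above, assuming requests is installed; the article URL and output filename are illustrative only, not taken from the source.

import requests

if __name__ == '__main__':
    # Fetch one article and write its lead-section sentences to a text file
    response = requests.get('https://en.wikipedia.org/wiki/Web_scraping')
    with open('sentences.txt', 'w', encoding='utf-8') as outfile:
        ok = getSentences(response, outfile)
    print('sentences extracted' if ok else 'no lead paragraph found')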
Source: train.py
...
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import os
import sys
def getFirstContent(dataUrl, modelUrl, modelName):
    training_data = load_files(dataUrl, encoding="utf-8")
    '''
    Start extracting features; here the features are word-frequency counts.
    '''
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_data.data)
    '''
    Continue extracting features; here the features are TF-IDF weights.
    '''
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    '''
    Train the classifier (a LogisticRegression pipeline) and make simple predictions.
    '''
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    mnb_pipeline.fit(X_train_tfidf, training_data.target)

    # Save in pkl format
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # Save in pmml format
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)
    if (os.path.exists(modelUrl + modelName)):
        return "success"
    else:
        return "fail"
if __name__ == '__main__':
    a = []
    for i in range(1, len(sys.argv)):
        a.append((str(sys.argv[i])))
...
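The truncated __main__ block above only collects the command-line arguments into a; presumably they are then handed to getFirstContent as dataUrl, modelUrl and modelName. A hedged example of a matching call follows; the directory layout and model name are assumptions (load_files expects one sub-directory per class under the data path).

# Hypothetical invocation, not from the source:
#   python train.py ./training_data/ ./models/ text_clf.pkl
status = getFirstContent('./training_data/', './models/', 'text_clf.pkl')
print(status)  # "success" if the model file exists afterwards, otherwise "fail"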