Best Python code snippet using selene_python
scrape_from_url.py
Source: scrape_from_url.py
# -*- coding: utf-8 -*-
'This module is used to scrape all texts from classified url'
from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from urllib.request import urlopen
from urllib.error import URLError
from typing import List, Tuple
from nltk import sent_tokenize
from _socket import gaierror
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import urllib
import pickle
import bs4
import re
import os

UNIVERSAL_ENCODING = "utf-8"


def tag_visible(element: bs4.element.ResultSet) -> bool:
    """Filter tags in html.

    Return False for an invisible content's tag.
    Return True for a visible content's tag.

    Args:
        element: a bs4.element instance waiting to be filtered.

    Returns:
        False for an invisible content's tag.
        True for a visible content's tag.
    """
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def crawl(URL: List[str]) -> Tuple[List[str], List[str]]:
    """Crawl corpus from classified URLs.

    Args:
        URL: a list of URL strings waiting to be scraped.

    Returns:
        contents: a list of string contents scraped from the given URLs.
        valid_URL: a list of URLs that were actually scraped, kept for the convenience of side-by-side annotating.
    """
    valid_URL = []
    contents = []
    for index, url in enumerate(tqdm(URL)):
        request = urllib.request.Request(url, headers={'User-Agent': UserAgent().random})
        try:
            html = urlopen(request, timeout=10).read().decode('utf-8')
        except gaierror as e:
            print(index, e, url)
            continue
        except URLError as e:
            print(index, e, url)
            continue
        except:
            print("Something else went wrong with", url, "\n")
            continue  # skip this URL so 'html' is never used uninitialised
        soup = BeautifulSoup(html, features='lxml')
        texts = soup.findAll(text=True)
        # Format and clean corpus.
        visible_texts = filter(tag_visible, texts)
        visible_texts = "".join(text for text in visible_texts)
        visible_texts = re.sub(r"(\r)+", "\r", visible_texts)
        visible_texts = re.sub(r"(\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"(\r\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"(\r)+", "\r", visible_texts)
        visible_texts = re.sub(r"(\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"\n(\s)+", "\n", visible_texts)
        visible_texts = re.sub(r"\s\n(\s)*", "\n", visible_texts)
        visible_texts = re.sub(r"\n(\W)+\n", "\n", visible_texts)
        visible_texts = re.sub(r"^(\s)+", "", visible_texts)
        visible_texts = re.sub(r"(\s)+$", "", visible_texts)
        visible_texts = re.sub(r"\. ", ".\n", visible_texts)
        visible_texts = re.sub(r"\w(\. \n)\w", ".\n", visible_texts)
        sentences = sent_tokenize(visible_texts)
        visible_texts = "\n".join(sentence for sentence in sentences)
        if visible_texts:
            valid_URL.append(url)
            contents.append(visible_texts)
        del visible_texts
    assert len(contents) == len(valid_URL)
    return contents, valid_URL


if __name__ == '__main__':
    corpus_folder = Path("../Data/Corpus2/")
    url_folder = Path("../Course_Collected/")
    websites = pd.read_csv(url_folder / "Final.csv")
    contents, scraped_URL = crawl(websites.URL)
    pickle.dump(contents, open(corpus_folder / "content.p", "wb"))
    pickle.dump(scraped_URL, open(corpus_folder / "url.p", "wb"))
    contents = pickle.load(open(corpus_folder / "content.p", "rb"))
    scraped_URL = pickle.load(open(corpus_folder / "url.p", "rb"))

    # Save corpus to file
    corpus_index = 0
    for content in tqdm(contents):
        file_name = str(corpus_index) + ".txt"
        full_file_path = corpus_folder / file_name
        with open(full_file_path, "w", encoding=UNIVERSAL_ENCODING) as file:
            file.write(content)
        corpus_index += 1

    # Save url list
    url_file_name = "url.txt"
    url_full_file_path = corpus_folder / url_file_name
    file = open(url_full_file_path, "w", encoding=UNIVERSAL_ENCODING)
    for key, value in tqdm(enumerate(scraped_URL)):
        if key != len(scraped_URL) - 1:
            file.write(value + '\n')
        else:
            file.write(value)
webscrap1.py
Source: webscrap1.py
import bs4 as bs
from bs4.element import Comment
import urllib.request
import re
import string
from nltk.tokenize import word_tokenize
# note: on scikit-learn >= 0.24 ENGLISH_STOP_WORDS lives in sklearn.feature_extraction.text
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
#from nltk.stem import WordNetLemmatizer
from heapq import nlargest
import ssl
from time import time
import userAgents


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    print("in text_from_html")
    soup = bs.BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    #print(texts)
    return visible_texts
    #return u" ".join(t.strip() for t in visible_texts)


ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
ourStopWords = set({"click", "view", "more", "link", "forgot", "password"})

print("imported")
#startTime = time()


def getSiteKeywords(url):
    html = ''
    count = 0
    while True:
        try:
            req = urllib.request.Request(url, data=None, headers={'User-Agent': userAgents.getRandomUserAgent()})
            html = urllib.request.urlopen(req, context=ctx).read()
            #html = urllib.request.urlopen(url, context=ctx).read()
            break
            #sauce = urllib.request.urlopen(url).read()
            #soup = bs.BeautifulSoup(sauce,'lxml')
        except Exception as e:
            print(e, url)
            if count == 2:
                return []
            count += 1
    #print(soup.get_text())
    #print(soup)
    #return None
    #tags = ['a','h1','h2','h3','meta','title','p','div']
    #keywords = []
    #for i in tags:
    #    keywords += list(soup.find_all(i))

    d = dict()
    #lemmatizer = WordNetLemmatizer()
    removeSpcCharPattern = re.compile('[\W_]+')
    #print(type(html))
    #count = 0
    for data in text_from_html(html):
        if len(data) <= 2:
            continue
        #print(data, len(data))
        text = data.lower().strip()
        text = removeSpcCharPattern.sub(' ', text)
        text = text.strip()
        text = re.sub(r'\d+', '', text)
        text = re.sub(r"[^\w\s]", "", text)
        #text = text.translate(string.maketrans("", "", string.punctuation))
        #print(text)
        tokens = word_tokenize(text)
        for i in tokens:
            if not i in ENGLISH_STOP_WORDS | ourStopWords:
                #result = lemmatizer.lemmatize(i)
                result = i
                if len(result) <= 2:
                    continue
                if result not in d:
                    d[result] = 1
                else:
                    d[result] += 1
        #if count == 20:
        #    break
        #count += 1

    largest50 = nlargest(min(50, len(d)), d, key=d.get)
    return largest50

#print(getSiteKeywords("https://www.itlearn360.com"))
#endTime = time()
#print(endTime - startTime)

"""
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import ssl

print("imported")

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    print("in text_from_html")
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    #print(texts)
    return u" ".join(t.strip() for t in visible_texts)

url = "https://expired.badssl.com"
#req = urllib.request(url)
#gcontext = ssl.SSLContext()  # Only for gangstars
print("no error")
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
html = urllib.request.urlopen(url, context=ctx).read()
#html = urllib.urlopen(req, context=gcontext).read()
print(text_from_html(html))
"""
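webscrap1.py imports a local userAgents module that is not shown on this page. Below is a hypothetical stand-in so getSiteKeywords() can be run end to end; only the function name getRandomUserAgent() comes from the code above, while the module body and the User-Agent strings are assumptions.

# userAgents.py - hypothetical stand-in for the helper imported above
import random

_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

def getRandomUserAgent():
    # return one browser-like User-Agent string at random
    return random.choice(_AGENTS)

With this file on the import path, print(getSiteKeywords("https://example.com")) (a placeholder URL) returns up to 50 of the most frequent non-stop-word tokens on the page.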
download10000files.py
Source: download10000files.py
import mechanize
import cookielib
import time
import os
import json
from sets import Set
import lxml.html
import codecs
import re
from bs4 import BeautifulSoup
import textwrap

# with codecs.open(file name,'r',encoding='utf8') as f:
#     text = f.read()
#     # process Unicode text


def strStr(haystack, needle):
    if len(haystack) < len(needle): return -1
    i = 0
    while i < len(haystack) - len(needle) + 1:
        j = 0; k = i
        while j < len(needle):
            if haystack[k] == needle[j]:
                j += 1; k += 1
            else:
                break
        if j == len(needle):
            break
        else:
            i += 1
    if i == len(haystack) - len(needle) + 1:
        return -1
    else:
        return i


def remove(visible_texts, needle):
    buffer = ''
    index = -1
    index = strStr(visible_texts, needle)
    if index != -1:
        buffer += visible_texts[:index]
        buffer += visible_texts[index + len(needle):]
    else:
        buffer = visible_texts
    return buffer


def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title', 'link', 'a']:
        return False
    return True


def filteralpha(str):
    str = re.sub(r'([^\s\w]|_)+', '', str)
    return " ".join(str.split(' '))


path = os.path.abspath("/Users/Xiaomin/testproject/tutorial/uniquename.txt")
urlfile = open(path, 'r')
listOfUrl = urlfile.read().split('\n')
print(len(listOfUrl))
i = 9655
from sys import path
c = os.getcwd()
os.chdir('/Users/Xiaomin/cs410hw2')
for url in listOfUrl:
    cj = cookielib.LWPCookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)
    br.open(url)
    st = br.response().read()
    #filename = url['name'].split('?q=')[1].split('&btn')[0]
    t = lxml.html.parse(url)
    title = t.find(".//title").text
    file = open('xxu46_' + str(i) + '.html', 'wb')
    file.write(st)
    file.close()
    #nonjs = re.subn(r'<(script).*? </\1>(?s)', '', str(st))[0]
    soup = BeautifulSoup(st)
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    visible_texts = ''.join(visible_texts)
    needle = '[if IE]><link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/site/consumer-navbar-ie-470687728._CB379390980_.css"><![endif]'
    visible_texts = remove(visible_texts, needle)
    needle = '<br>'
    visible_texts = remove(visible_texts, needle)
    needle = '<a href="/register/sharing">enable Facebook sharing</a>'
    visible_texts = remove(visible_texts, needle)
    visible_texts = filteralpha(visible_texts)
    with codecs.open('xxu46_' + str(i) + '.txt', 'w', encoding='utf8') as txt:
        txt.write(url + '\n')
        txt.write(title + '\n')
        txt.write(''.join(visible_texts))
        txt.close()
    i += 1

#
# import urllib
# >>> html = urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()
#
# >>>
# >>> soup = BeautifulSoup(html)
# >>> texts = soup.findAll(text=True)
# >>>
# >>> def visible(element):
# ...     if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
# ...         return False
# ...     elif re.match('<!--.*-->', str(element)):
# ...         return False
# ...     return True
# ...
# >>> visible_texts = filter(visible, texts)
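The strStr()/remove() pair above hand-rolls naive substring search. As a point of comparison, the same remove() behaviour can be expressed with Python's built-in str.find, which also returns -1 when the needle is absent; this sketch is illustrative and not part of the original script.

# equivalent_remove.py - illustrative alternative, not used by download10000files.py
def remove(visible_texts, needle):
    index = visible_texts.find(needle)  # built-in substring search, -1 if absent
    if index == -1:
        return visible_texts
    # drop only the first occurrence, matching the behaviour of the original helper
    return visible_texts[:index] + visible_texts[index + len(needle):]

print(remove("hello <br> world <br>", "<br>"))  # -> "hello  world <br>"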