Best Python code snippet using playwright-python
hiv_org2.py
Source: hiv_org2.py
...
# Imports inferred from usage; the top of the file is truncated in this listing.
import re
import logging

import numpy as np
import pandas as pd
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from w3lib.html import remove_tags, remove_tags_with_content


def get_domain(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]


def trim_url(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched


class OrgWebsite(scrapy.Item):
    link = scrapy.Field()
    domain = scrapy.Field()
    referer = scrapy.Field()


class HIVBootstraper(scrapy.Spider):
    # TODO: change custom settings when not debugging
    name = 'hiv_bootstraper'
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivBootstrapScrapingPipeline': 300},
        'CLOSESPIDER_ITEMCOUNT': 100
    }
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVBootstraper, self).__init__(**kw)
        # self.start_urls = self.__getattribute__()
        # self.allowed_domains = [get_domain(self.start_urls[0])]
        logging.info('Starting Bootstrap Spider with : %s', ', '.join(self.start_urls))

    def parse(self, response):
        # Links pointing outside the allowed and already-saved domains are new organizations.
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        # Pages that repeatedly yield no new domains are treated as dead ends.
        self.restricted_sections = [k for k in self.dead_ends if self.dead_ends[k] > 3]


class HIVChecker(scrapy.Spider):
    name = 'hiv_checker'
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.ClfHIVPipeline': 300}  # CheckHIVPipeline
    }

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.hiv_check) for dom in self._load_domains_to_check()]

    def hiv_check(self, response):  # parse method
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump}

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False

    def _load_domains_to_check(self):
        doms = pd.read_csv('domains.csv')
        doms = doms[doms['to_crawl'].isnull()].sort_values(by='references')['domain'].tolist()
        logging.info("%s new domains to be checked for HIV" % str(len(doms)))
        return doms


class HIVSatellite(scrapy.Spider):
    # TODO: update the dead-end mechanism to also check whether the new pages are relevant
    name = 'hiv_satellite'
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivSatScrapingPipeline': 300},
                       'CLOSESPIDER_PAGECOUNT': 500}
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVSatellite, self).__init__(**kw)
        self.start_urls, self.allowed_domains = self._get_starting_state()
        if len(self.start_urls) == 1:
            logging.info('New satellite spider : %s', self.start_urls[0])

    def parse(self, response):
        # TODO: find a way to share this logic with HIVBootstraper (maybe a common base class?)
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends if self.dead_ends[k] > 3]

    def _get_starting_state(self):
        doms = pd.read_csv('domains.csv')
        eligible_doms = doms[np.logical_and(doms['to_crawl'] == 1, doms['crawled'] == 0)]['domain'].tolist()
        if len(eligible_doms) > 0:
            # Take the first eligible domain and mark it as crawled in the file.
            chosen_dom = eligible_doms[0]
            doms.loc[doms['domain'] == chosen_dom, 'crawled'] = 1
            doms.to_csv('domains.csv', index=False)
            return [chosen_dom], [get_domain(chosen_dom)]
        else:
            return [], []


class DataSetBuilder(scrapy.Spider):
    name = 'dataset_builder'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.DataSetPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        # Look up the hand-labelled flags for this domain once instead of six times.
        labels = self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump,
               'hiv': labels['hiv'].values[0],
               'research': labels['research'].values[0],
               'gov': labels['gov'].values[0],
               'uni': labels['uni'].values[0],
               'ngo': labels['ngo'].values[0],
               'association': labels['association'].values[0]}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        return dom_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False


class DataSetEnricher(scrapy.Spider):
    # TODO: change custom settings when not debugging
    name = 'dataset_enricher'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.EnrichPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'about_dump': word_dump}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        # Probe the two most common "about" paths for every labelled domain.
        about_list = [d + "/about" for d in dom_list] + [d + "/about-us" for d in dom_list]
        return about_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False
...
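For orientation, here is a minimal sketch of how one of the spiders above could be driven from a script. It is not part of the original project: the module name hiv_org2 and the URLs are assumptions, and it relies on the spider receiving start_urls and allowed_domains as keyword arguments, which scrapy.Spider's constructor copies onto the instance.

# Hypothetical driver for the spiders above; module path and URLs are assumptions.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hiv_org2 import HIVBootstraper  # assumed module name

# get_project_settings() needs to run inside the Scrapy project so the
# 'hiv_scraping.pipelines' pipelines referenced in custom_settings resolve.
process = CrawlerProcess(get_project_settings())
process.crawl(HIVBootstraper,
              start_urls=['https://www.example.org'],
              allowed_domains=['example.org'])
process.start()  # blocks until the crawl finishes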
test_tools_web.py
Source: test_tools_web.py
...
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]


@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
def test_trim_url_remove_trailing_char(trailing_char):
    test_url = 'http://example.com/'
    assert trim_url(test_url + trailing_char) == test_url
    # assert trailing_char removed only if it is trailing
    test_url = 'http://example.com/' + trailing_char + 'content'
    assert trim_url(test_url) == test_url


@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_remove_trailing_enclosing(left, right):
    # right without left => right is removed
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right)
    # right after path without left => right is removed
    test_url = 'http://example.com/a'
    assert test_url == trim_url(test_url + right)
    # trailing left without right => left is kept
    test_url = 'http://example.com/a' + left
    assert test_url == trim_url(test_url)
    # left before content without right => left is kept
    test_url = 'http://example.com/a' + left + 'something'
    assert test_url == trim_url(test_url)
    # left + content + right => right is kept
    assert test_url + right == trim_url(test_url + right)


@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_trailing_char_and_enclosing(trailing_char, left, right):
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right + trailing_char)
    # assert the trailing char is kept if there is something else
    test_url = 'http://example.com/' + trailing_char
...
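TRAILING_CHARS is defined above the excerpt and not shown; assuming it holds ordinary sentence punctuation, a trim_url consistent with these tests could look roughly like the sketch below. This illustrates the contract the tests describe, not the library's actual implementation.

# Sketch only; TRAILING_CHARS is an assumption, ENCLOSING_PAIRS comes from the tests.
TRAILING_CHARS = ['.', ',', '!', '?']
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]


def trim_url(url):
    """Strip trailing punctuation and unbalanced closing brackets from a URL."""
    closers = {right: left for left, right in ENCLOSING_PAIRS}
    while url:
        last = url[-1]
        if last in TRAILING_CHARS:
            url = url[:-1]
        elif last in closers and closers[last] not in url:
            # A closing bracket with no matching opener is not part of the URL.
            url = url[:-1]
        else:
            break
    return url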
html.py
Source: html.py
...
        lead, middle, trail = match.groups()
        if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
                (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
            middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if middle.startswith('http://') or middle.startswith('https://'):
            middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if '@' in middle and not middle.startswith('www.') and ':' not in middle \
                and simple_email_re.match(middle):
            middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
        if lead + middle + trail != word:
            words[i] = lead + middle + trail
    return ''.join(words)


def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close, and <br clear="all">.
...
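The excerpt sits inside a word-by-word loop of an urlize-style helper: bare www./.com/.org/.net hosts and http(s) URLs are wrapped in anchor tags, and plain email addresses become mailto links. A hypothetical usage sketch, assuming the enclosing function is named urlize and takes the text as its only required argument:

# Assumed call shape; the function signature is not visible in the excerpt.
text = 'See www.example.org or mail admin@example.org'
print(urlize(text))
# Expected shape of the output:
# See <a href="http://www.example.org">www.example.org</a> or
# mail <a href="mailto:admin@example.org">admin@example.org</a>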
test_utils.py
Source: test_utils.py
...
            err.get_error()["error"] == data["error"],
            err.get_error()["errcode"] == data["errcode"],
        ]
    )


def test_trim_url():
    """Test trim_url"""
    url = "https://example.com"
    assert trim_url(url) == url
    assert trim_url(f"{url}/") == url
    path = "/foo/bar"
    assert trim_url(path) == path
    assert trim_url(f"{path}/") == path


def get_nodeinfo_index(base: str):
    resp = {
        "links": [
            {
                "href": f"{base}/.well-known/nodeinfo/2.0.json",
                "rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
            }
        ]
    }
    return (f"{base}/.well-known/nodeinfo", resp)


def get_nodeinfo_resp():
    nodeinfo = {
        "version": "2.0",
        "software": {
...
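Unlike the trim_url exercised in test_tools_web.py, the one under test here only normalizes a trailing slash. An implementation consistent with these assertions would be as simple as:

def trim_url(url: str) -> str:
    # Sketch inferred from the tests: drop a single trailing slash so
    # "https://example.com/" and "https://example.com" compare equal.
    return url[:-1] if url.endswith("/") else url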