Best Python code snippet using playwright-python
hiv_org2.py
Source: hiv_org2.py
...
from w3lib.html import remove_tags, remove_tags_with_content

def get_domain(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]

def trim_url(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched

class OrgWebsite(scrapy.Item):
    link = scrapy.Field()
    domain = scrapy.Field()
    referer = scrapy.Field()

class HIVBootstraper(scrapy.Spider):
    # TODO : Change custom setting when not debugging
    name = 'hiv_bootstraper'
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivBootstrapScrapingPipeline': 300},
        'CLOSESPIDER_ITEMCOUNT': 100
    }
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVBootstraper, self).__init__(**kw)
        # self.start_urls = self.__getattribute__()
        # self.allowed_domains = [get_domain(self.start_urls[0])]
        logging.info('Starting Bootstrap Spider with : %s', ', '.join(self.start_urls))

    def parse(self, response):
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

class HIVChecker(scrapy.Spider):
    name = 'hiv_checker'
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.ClfHIVPipeline': 300}  # CheckHIVPipeline
    }

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.hiv_check) for dom in self._load_domains_to_check()]

    def hiv_check(self, response):  # parse method
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump}

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True

    def _load_domains_to_check(self):
        doms = pd.read_csv('domains.csv')
        doms = doms[doms['to_crawl'].isnull()].sort_values(by='references')['domain'].tolist()
        logging.info("%s new domains to be check for HIV" % str(len(doms)))
        return doms

class HIVSatellite(scrapy.Spider):
    # TODO : update the dead-end mechanism to also check whether the new pages are relevant
    name = 'hiv_satellite'
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivSatScrapingPipeline': 300},
                       'CLOSESPIDER_PAGECOUNT': 500}
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVSatellite, self).__init__(**kw)
        self.start_urls, self.allowed_domains = self._get_starting_state()
        if len(self.start_urls) == 1:
            logging.info('New satellite spider : %s', self.start_urls[0])

    def parse(self, response):
        # TODO : Find a way to have the exact same logic as the HIVBootstrap spider (maybe just have the exact same type?)
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

    def _get_starting_state(self):
        doms = pd.read_csv('domains.csv')
        eligible_doms = doms[np.logical_and(doms['to_crawl'] == 1, doms['crawled'] == 0)]['domain'].tolist()
        if len(eligible_doms) > 0:
            # take first result
            chosen_dom = eligible_doms[0]
            # update file
            doms.loc[doms['domain'] == chosen_dom, 'crawled'] = 1
            doms.to_csv('domains.csv', index=False)
            return [chosen_dom], [get_domain(chosen_dom)]
        else:
            return [], []

class DataSetBuilder(scrapy.Spider):
    name = 'dataset_builder'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.DataSetPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump,
               'hiv': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['hiv'].values[0],
               'research': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['research'].values[0],
               'gov': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['gov'].values[0],
               'uni': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['uni'].values[0],
               'ngo': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['ngo'].values[0],
               'association': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['association'].values[0]}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        return dom_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False

class DataSetEnricher(scrapy.Spider):
    # TODO : Change custom setting when not debugging
    name = 'dataset_enricher'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.EnrichPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'about_dump': word_dump}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        about_list = [d + "/about" for d in dom_list] + [d + "/about-us" for d in dom_list]
        return about_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
...
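The two helpers at the top of this file are simple regex-based URL normalizers. A minimal, self-contained illustration of what they return, reusing the snippet's own definitions together with the re import they rely on:

import re

def get_domain(url):
    # keep only the host part, e.g. 'example.org'
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]

def trim_url(url):
    # keep scheme + host, e.g. 'https://example.org'
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched

print(get_domain('https://example.org/about-us'))  # example.org
print(trim_url('https://example.org/about-us'))    # https://example.org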
test_tools_web.py
Source: test_tools_web.py
...
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
def test_trim_url_remove_trailing_char(trailing_char):
    test_url = 'http://example.com/'
    assert trim_url(test_url + trailing_char) == test_url
    # assert trailing_char removed only if it is trailing
    test_url = 'http://example.com/' + trailing_char + 'content'
    assert trim_url(test_url) == test_url

@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_remove_trailing_enclosing(left, right):
    # right without left => right is removed
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right)
    # right after path without left => right is removed
    test_url = 'http://example.com/a'
    assert test_url == trim_url(test_url + right)
    # trailing left without right => left is kept
    test_url = 'http://example.com/a' + left
    assert test_url == trim_url(test_url)
    # left before content without right => left is kept
    test_url = 'http://example.com/a' + left + 'something'
    assert test_url == trim_url(test_url)
    # left + content + right => right is kept
    assert test_url + right == trim_url(test_url + right)

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_trailing_char_and_enclosing(trailing_char, left, right):
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right + trailing_char)
    # assert the trailing char is kept if there is something else
    test_url = 'http://example.com/' + trailing_char
...
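The excerpt does not show TRAILING_CHARS or the trim_url implementation itself, but the assertions imply a function that strips trailing punctuation and unbalanced closing brackets. A rough sketch consistent with the cases above, with TRAILING_CHARS assumed to be common sentence punctuation:

TRAILING_CHARS = '.,;:!?'  # assumption: the excerpt does not show the real list
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

def trim_url(url: str) -> str:
    # repeatedly strip trailing punctuation and closing brackets that have no
    # matching opening bracket earlier in the URL
    changed = True
    while changed:
        changed = False
        if url and url[-1] in TRAILING_CHARS:
            url = url[:-1]
            changed = True
        for left, right in ENCLOSING_PAIRS:
            if url.endswith(right) and url.count(left) < url.count(right):
                url = url[:-1]
                changed = True
    return url

print(trim_url('http://example.com/a).'))          # http://example.com/a
print(trim_url('http://example.com/a(b)'))         # http://example.com/a(b)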
html.py
Source: html.py
...
        lead, middle, trail = match.groups()
        if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                len(middle) > 0 and middle[0] in string.letters + string.digits and \
                (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
            middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if middle.startswith('http://') or middle.startswith('https://'):
            middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                and simple_email_re.match(middle):
            middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
        if lead + middle + trail != word:
            words[i] = lead + middle + trail
    return ''.join(words)

def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
...
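In this snippet trim_url is applied only to the anchor's visible text, not to the href, so it acts as a display-shortening helper rather than a URL normalizer. Its body is not included in the excerpt; a plausible sketch of that behavior, in which the limit parameter and the ellipsis suffix are assumptions:

def trim_url(x, limit=None):
    # shorten the displayed link text to `limit` characters, appending '...'
    if limit is None or len(x) <= limit:
        return x
    return x[:max(0, limit - 3)] + '...'

print(trim_url('http://example.com/a/very/long/path', limit=20))  # http://example.co...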
test_utils.py
Source: test_utils.py
...
            err.get_error()["error"] == data["error"],
            err.get_error()["errcode"] == data["errcode"],
        ]
    )

def test_trim_url():
    """Test trim_url"""
    url = "https://example.com"
    assert trim_url(url) == url
    assert trim_url(f"{url}/") == url
    path = "/foo/bar"
    assert trim_url(path) == path
    assert trim_url(f"{path}/") == path

def get_nodeinfo_index(base: str):
    resp = {
        "links": [
            {
                "href": f"{base}/.well-known/nodeinfo/2.0.json",
                "rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
            }
        ]
    }
    return (f"{base}/.well-known/nodeinfo", resp)

def get_nodeinfo_resp():
    nodeinfo = {
        "version": "2.0",
        "software": {
...
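Here the tests only require that a single trailing slash be dropped, for both absolute URLs and bare paths. A one-line implementation that satisfies these assertions, which is not necessarily the project's actual code:

def trim_url(url: str) -> str:
    # drop one trailing slash so 'https://example.com/' and 'https://example.com' compare equal
    return url[:-1] if url.endswith('/') else url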
Why did a playwright-python app running in Docker fail? Headless=False?
Why can't I interact (fill, click, etc) with this element using Playwright in my Python code?
How to use Playwright and BeautifulSoup on a web page which has pagination?
How to scrape product data from website pages that use GraphQL?
What are the differences between Python Playwright sync vs. async APIs?
How to get a list of all links from a dynamic web page?
Install playwright from a local directory
Using Playwright for Python, how do I select an option from a drop down list?
Playwright won't navigate to URL (Python)
Playwright: click on element within one/multiple elements using Python
After investigating and trying several things, it looks like the problem is the browser's user_agent in headless mode; for some reason that page rejects the default user agent. Try this:
from playwright.sync_api import sync_playwright

def extract_html(self):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # override the default headless user agent, which the site appears to block
        page = browser.new_page(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36')
        page.goto('http://book.flygofirst.com/Flight/Select?inl=0&CHD=0&s=True&o1=BOM&d1=BLR&ADT=1&dd1=2022-12-10&gl=0&glo=0&cc=INR&mon=true')
        html = page.inner_html('#sectionBody')
        browser.close()
        return html
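To confirm which user agent string a site actually receives (and see why the headless default can be detected), a quick hedged check like the following can help; example.com is just a placeholder target:

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')
    # navigator.userAgent is the string the site sees;
    # in headless Chromium it typically contains "HeadlessChrome"
    print(page.evaluate('navigator.userAgent'))
    browser.close()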
Check out the latest blogs from LambdaTest on this topic:
The year 2021 can be encapsulated as one major transition. In 2022, the current breakthroughs in the elusive fight to eliminate the COVID-19 pandemic are top of mind for enterprises globally. At the same time, we are witnessing recent strides in technological advancements as the world gets digitized. As a result, the year 2022 will see the resumption of massive changes in technology and digital transformation, driving firms to adapt and transform themselves perpetually.
Ruby is a programming language which is well suited to web automation. Ruby makes an excellent choice because of its clean syntax, focus on built-in library integrations, and an active community. Another benefit of Ruby is that it also allows other programming languages like Java, Python, etc. to be used to automate applications written in other frameworks. Therefore you can use Selenium Ruby to automate any sort of application in your system and test the results in any type of testing environment.
Playwright is a framework that I’ve always heard great things about but never had a chance to pick up until earlier this year. And since then, it’s become one of my favorite test automation frameworks to use when building a new automation project. It’s easy to set up, feature-packed, and one of the fastest, most reliable frameworks I’ve worked with.
One of the biggest problems I’ve faced when building a test suite is not the writing of the tests but the execution. How can I execute 100s or 1000s of tests in parallel? If I try that on my local machine, it would probably catch fire – so we need a remote environment to send these to.
LambdaTest’s Playwright tutorial will give you a broader idea about the Playwright automation framework, its unique features, and use cases with examples to exceed your understanding of Playwright testing. This tutorial will give A to Z guidance, from installing the Playwright framework to some best practices and advanced concepts.