Best Python code snippet using playwright-python
hiv_org2.py
Source: hiv_org2.py
...
from w3lib.html import remove_tags, remove_tags_with_content


def get_domain(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]


def trim_url(url):
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched


class OrgWebsite(scrapy.Item):
    link = scrapy.Field()
    domain = scrapy.Field()
    referer = scrapy.Field()


class HIVBootstraper(scrapy.Spider):
    # TODO: change custom settings when not debugging
    name = 'hiv_bootstraper'
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivBootstrapScrapingPipeline': 300},
        'CLOSESPIDER_ITEMCOUNT': 100
    }
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVBootstraper, self).__init__(**kw)
        # self.start_urls = self.__getattribute__()
        # self.allowed_domains = [get_domain(self.start_urls[0])]
        logging.info('Starting Bootstrap Spider with : %s', ', '.join(self.start_urls))

    def parse(self, response):
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]


class HIVChecker(scrapy.Spider):
    name = 'hiv_checker'
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.ClfHIVPipeline': 300}  # CheckHIVPipeline
    }

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.hiv_check) for dom in self._load_domains_to_check()]

    def hiv_check(self, response):  # parse method
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump}

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True

    def _load_domains_to_check(self):
        doms = pd.read_csv('domains.csv')
        doms = doms[doms['to_crawl'].isnull()].sort_values(by='references')['domain'].tolist()
        logging.info("%s new domains to be checked for HIV" % str(len(doms)))
        return doms


class HIVSatellite(scrapy.Spider):
    # TODO: update the dead-end mechanism to also check whether the new pages are relevant
    name = 'hiv_satellite'
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivSatScrapingPipeline': 300},
                       'CLOSESPIDER_PAGECOUNT': 500}
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVSatellite, self).__init__(**kw)
        self.start_urls, self.allowed_domains = self._get_starting_state()
        if len(self.start_urls) == 1:
            logging.info('New satellite spider : %s', self.start_urls[0])

    def parse(self, response):
        # TODO: find a way to share the exact same logic as the HIVBootstraper spider (maybe just use the same type?)
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

    def _get_starting_state(self):
        doms = pd.read_csv('domains.csv')
        eligible_doms = doms[np.logical_and(doms['to_crawl'] == 1, doms['crawled'] == 0)]['domain'].tolist()
        if len(eligible_doms) > 0:
            # take the first result
            chosen_dom = eligible_doms[0]
            # update the file
            doms.loc[doms['domain'] == chosen_dom, 'crawled'] = 1
            doms.to_csv('domains.csv', index=False)
            return [chosen_dom], [get_domain(chosen_dom)]
        else:
            return [], []


class DataSetBuilder(scrapy.Spider):
    name = 'dataset_builder'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.DataSetPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump,
               'hiv': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['hiv'].values[0],
               'research': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['research'].values[0],
               'gov': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['gov'].values[0],
               'uni': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['uni'].values[0],
               'ngo': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['ngo'].values[0],
               'association': self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]['association'].values[0]}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        return dom_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False


class DataSetEnricher(scrapy.Spider):
    # TODO: change custom settings when not debugging
    name = 'dataset_enricher'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.EnrichPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'about_dump': word_dump}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        about_list = [d + "/about" for d in dom_list] + [d + "/about-us" for d in dom_list]
        return about_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
...
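The file defines several Scrapy spiders (a bootstrapper, a checker, a satellite crawler, and two dataset builders) that share the domains.csv and dataset/dom_lbl.csv files as state. Below is a minimal sketch, not taken from the original project, of how the bootstrap spider could be launched programmatically; the module path, the seed URL, and the idea of passing start_urls and allowed_domains as spider arguments are assumptions suggested by the commented-out lines in __init__.

# Sketch only: module path, seed URL and spider arguments are assumptions.
from scrapy.crawler import CrawlerProcess

from hiv_org2 import HIVBootstraper  # hypothetical module path

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(
    HIVBootstraper,
    start_urls=["https://www.example.org/"],   # placeholder seed site
    allowed_domains=["www.example.org"],       # domains the spider may stay on
)
process.start()  # blocks until CLOSESPIDER_ITEMCOUNT is hit or the crawl finishes

Scrapy copies keyword arguments passed to crawl() onto the spider instance, which is why the constructor's logging call can read self.start_urls.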
test_tools_web.py
Source: test_tools_web.py
...
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
def test_trim_url_remove_trailing_char(trailing_char):
    test_url = 'http://example.com/'
    assert trim_url(test_url + trailing_char) == test_url
    # assert trailing_char removed only if it is trailing
    test_url = 'http://example.com/' + trailing_char + 'content'
    assert trim_url(test_url) == test_url

@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_remove_trailing_enclosing(left, right):
    # right without left => right is removed
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right)
    # right after path without left => right is removed
    test_url = 'http://example.com/a'
    assert test_url == trim_url(test_url + right)
    # trailing left without right => left is kept
    test_url = 'http://example.com/a' + left
    assert test_url == trim_url(test_url)
    # left before content without right => left is kept
    test_url = 'http://example.com/a' + left + 'something'
    assert test_url == trim_url(test_url)
    # left + content + right => right is kept
    assert test_url + right == trim_url(test_url + right)

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_trailing_char_and_enclosing(trailing_char, left, right):
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right + trailing_char)
    # assert the trailing char is kept if there is something else
    test_url = 'http://example.com/' + trailing_char
...
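These tests pin down a trim_url that strips trailing punctuation and unbalanced closing brackets from a URL. The sketch below is one implementation consistent with the visible assertions, not the project's actual code; the TRAILING_CHARS definition is truncated above, so a plausible punctuation set is assumed here.

# Sketch only: TRAILING_CHARS is assumed; ENCLOSING_PAIRS repeats the
# definition above so the snippet is self-contained.
TRAILING_CHARS = ['.', ',', ':', ';', '!', '?']  # assumption
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

def trim_url(url: str) -> str:
    # Drop a single trailing punctuation character, if any.
    if url and url[-1] in TRAILING_CHARS:
        url = url[:-1]
    # Drop a trailing closing bracket only when its opener never appears in the URL.
    for left, right in ENCLOSING_PAIRS:
        if url.endswith(right) and left not in url:
            url = url[:-1]
    return url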
html.py
Source: html.py
...
        lead, middle, trail = match.groups()
        if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                len(middle) > 0 and middle[0] in string.letters + string.digits and \
                (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
            middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if middle.startswith('http://') or middle.startswith('https://'):
            middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                and simple_email_re.match(middle):
            middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
        if lead + middle + trail != word:
            words[i] = lead + middle + trail
    return ''.join(words)

def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
...
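Note that this fragment appears to target Python 2: string.letters was removed in Python 3, where the equivalent constant is string.ascii_letters (combined with string.digits as above).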
test_utils.py
Source: test_utils.py
...
            err.get_error()["error"] == data["error"],
            err.get_error()["errcode"] == data["errcode"],
        ]
    )

def test_trim_url():
    """Test trim_url"""
    url = "https://example.com"
    assert trim_url(url) == url
    assert trim_url(f"{url}/") == url
    path = "/foo/bar"
    assert trim_url(path) == path
    assert trim_url(f"{path}/") == path

def get_nodeinfo_index(base: str):
    resp = {
        "links": [
            {
                "href": f"{base}/.well-known/nodeinfo/2.0.json",
                "rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
            }
        ]
    }
    return (f"{base}/.well-known/nodeinfo", resp)

def get_nodeinfo_resp():
    nodeinfo = {
        "version": "2.0",
        "software": {
...
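Unlike the test_tools_web.py helper above, the trim_url exercised here only needs to drop a single trailing slash. A one-line sketch consistent with these assertions (again, not the project's actual code):

# Sketch consistent with the assertions above; not the project's implementation.
def trim_url(url: str) -> str:
    return url[:-1] if url.endswith("/") else url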
Playwright error connection refused in docker
playwright-python advanced setup
How to select an input according to a parent sibling label
Error when installing Microsoft Playwright
Trouble waiting for changes to complete that are triggered by Python Playwright `select_option`
Capturing and Storing Request Data Using Playwright for Python
Can Playwright be used to launch a browser instance
Trouble in Clicking on Log in Google Button of Pop Up Menu Playwright Python
Scrapy Playwright get date by clicking button
React locator example
I solved my problem. My Docker container for the frontend is called "app", which is also the domain name of the frontend application. The application runs locally over plain HTTP, but the Chromium and Gecko drivers force an HTTPS connection for certain domain names, one of which is "app". So I had to rename the Docker container that hosts the frontend application.
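A quick way to verify a fix like this from Playwright for Python is sketched below; the hostname "frontend" and port 3000 are placeholders for whatever the renamed container is reachable as, and the HTTPS-upgrade behaviour is as described in the answer above.

# Sketch: "frontend" and port 3000 are placeholder values for the renamed container.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("http://frontend:3000")  # previously http://app:3000, which the browser upgraded to HTTPS
    print(page.title())
    browser.close()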
Check out the latest blogs from LambdaTest on this topic:
The sky’s the limit (and even beyond that) when you want to run test automation. Technology has advanced so much that you can cut testing time and stay far more productive than you could 10 years ago. You needn’t put up with the limitations of Selenium if that’s your go-to automation testing tool. Instead, you can pick from a range of test automation frameworks and tools to write effective test cases and run them successfully.
When it comes to web automation testing, a number of frameworks such as Selenium, Cypress, Playwright, and Puppeteer make it onto the ‘preferred list’. The choice of test automation framework depends on parameters like project type, complexity, and scale, along with the framework expertise available within the team. However, it’s no surprise that Selenium is still the most preferred framework among developers and QAs.
Playwright is a framework that I’ve always heard great things about but never had a chance to pick up until earlier this year. And since then, it’s become one of my favorite test automation frameworks to use when building a new automation project. It’s easy to set up, feature-packed, and one of the fastest, most reliable frameworks I’ve worked with.
The speed at which tests are executed and the “dearth of smartness” in testing are the two major problems developers and testers encounter.
With technology evolving rapidly to meet ever-increasing demand, digital security has become a major concern for the software industry. There are various ways to achieve it, Captcha being one of them. Captcha is easy for humans to solve but hard for “bots” and other malicious software to figure out. However, Captcha has always been tricky to automate, as many testers don’t know how to handle it in Selenium or any other test automation framework.
LambdaTest’s Playwright tutorial will give you a broader idea of the Playwright automation framework, its unique features, and its use cases, with examples to deepen your understanding of Playwright testing. It offers A-to-Z guidance, from installing the Playwright framework to best practices and advanced concepts.