How to use trim_url method in Playwright Python

Best Python code snippet using playwright-python

hiv_org2.py

Source: hiv_org2.py Github

copy

Full Screen

# NOTE(review): the scraped snippet starts at line 9 of hiv_org2.py — the
# imports for re, logging, scrapy, pandas (pd), numpy (np), LinkExtractor and
# Selector sit above the visible chunk and are assumed to be present.
from w3lib.html import remove_tags, remove_tags_with_content


def get_domain(url):
    """Return the host part of *url*, with the scheme stripped.

    Raises AttributeError when *url* does not start with http:// or
    https://, because ``re.match`` then returns None.
    """
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]


def trim_url(url):
    """Return *url* trimmed down to scheme + host (path/query removed)."""
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched


class OrgWebsite(scrapy.Item):
    """Item describing an external organisation website found while crawling."""
    link = scrapy.Field()     # full URL of the discovered link
    domain = scrapy.Field()   # scheme + host of the link
    referer = scrapy.Field()  # scheme + host of the page that linked to it


class HIVBootstraper(scrapy.Spider):
    """Seed spider: harvests external domains linked from the start URLs."""
    # TODO : Change custom setting when not debugging
    name = 'hiv_bootstraper'
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivBootstrapScrapingPipeline': 300},
        'CLOSESPIDER_ITEMCOUNT': 100
    }
    # NOTE(review): class-level mutables are shared by all instances of this
    # spider class; fine for one spider per process, but worth confirming.
    saved_domains = []        # domains already emitted, to avoid duplicates
    dead_ends = {}            # url -> number of times it yielded no new external links
    restricted_sections = []  # urls excluded from further crawling

    def __init__(self, **kw):
        super(HIVBootstraper, self).__init__(**kw)
        # self.start_urls = self.__getattribute__()
        # self.allowed_domains = [get_domain(self.start_urls[0])]
        logging.info('Starting Bootstrap Spider with : %s', ', '.join(self.start_urls))

    def parse(self, response):
        """Yield an OrgWebsite per new external domain, then keep crawling internally."""
        # External links: anything NOT on the allowed or already-saved domains.
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        # Internal links used to continue crawling the allowed domains.
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            # No new external links on this page: count it as a dead end.
            # (was a bare ``except:`` around the increment — narrowed to the
            # missing-key case via dict.get)
            self.dead_ends[response.request.url] = self.dead_ends.get(response.request.url, 0) + 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        """Blacklist pages seen as dead ends more than 3 times."""
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]


class HIVChecker(scrapy.Spider):
    """Downloads each unchecked domain and dumps its visible text for classification."""
    name = 'hiv_checker'
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.ClfHIVPipeline': 300}  # CheckHIVPipeline
    }

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.hiv_check) for dom in self._load_domains_to_check()]

    def hiv_check(self, response):  # parse method
        """Yield {'domain', 'text_dump'} with all non-blank body text, scripts excluded."""
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump}

    def _has_content(self, txt):
        """Return True if *txt* holds at least one non-whitespace character."""
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        # Was an implicit None; explicit False for consistency with DataSetBuilder.
        return False

    def _load_domains_to_check(self):
        """Domains from domains.csv not yet flagged for crawling, sorted by reference count."""
        doms = pd.read_csv('domains.csv')
        doms = doms[doms['to_crawl'].isnull()].sort_values(by='references')['domain'].tolist()
        logging.info("%s new domains to be check for HIV" % str(len(doms)))
        return doms


class HIVSatellite(scrapy.Spider):
    """Crawls one selected domain at a time, harvesting further external domains."""
    # TODO : update the dead-end mechanism to also check whether the new pages are relevant
    name = 'hiv_satellite'
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivSatScrapingPipeline': 300},
                       'CLOSESPIDER_PAGECOUNT': 500}
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVSatellite, self).__init__(**kw)
        self.start_urls, self.allowed_domains = self._get_starting_state()
        if len(self.start_urls) == 1:
            logging.info('New satellite spider : %s', self.start_urls[0])

    def parse(self, response):
        # TODO : Find a way to have the exact same logic as the HIVBootstrap spider (maybe just have the exact same type?)
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            # Same dead-end bookkeeping as HIVBootstraper (bare except narrowed).
            self.dead_ends[response.request.url] = self.dead_ends.get(response.request.url, 0) + 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        """Blacklist pages seen as dead ends more than 3 times."""
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

    def _get_starting_state(self):
        """Pick the first eligible domain (to_crawl==1, crawled==0) and mark it crawled.

        Returns ([domain], [host]) or ([], []) when nothing is eligible.
        Note the side effect: domains.csv is rewritten with the chosen row updated.
        """
        doms = pd.read_csv('domains.csv')
        eligible_doms = doms[np.logical_and(doms['to_crawl'] == 1, doms['crawled'] == 0)]['domain'].tolist()
        if len(eligible_doms) > 0:
            # take first result
            chosen_dom = eligible_doms[0]
            # update file
            doms.loc[doms['domain'] == chosen_dom, 'crawled'] = 1
            doms.to_csv('domains.csv', index=False)
            return [chosen_dom], [get_domain(chosen_dom)]
        else:
            return [], []


class DataSetBuilder(scrapy.Spider):
    """Builds the labelled text dataset from domains already tagged in dom_lbl.csv."""
    name = 'dataset_builder'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.DataSetPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')  # NOTE(review): read at class-definition time

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        """Yield the page's text dump together with its labels from dom_lbl.csv."""
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        # Look the label row up once instead of re-filtering the frame per column.
        row = self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump,
               'hiv': row['hiv'].values[0],
               'research': row['research'].values[0],
               'gov': row['gov'].values[0],
               'uni': row['uni'].values[0],
               'ngo': row['ngo'].values[0],
               'association': row['association'].values[0]}

    def _load_domains(self):
        """Domains with a non-null 'hiv' label."""
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        return dom_list

    def _has_content(self, txt):
        """Return True if *txt* holds at least one non-whitespace character."""
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False


class DataSetEnricher(scrapy.Spider):
    """Enriches labelled domains with the text of their /about(-us) pages."""
    # TODO : Change custom setting when not debugging
    name = 'dataset_enricher'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.EnrichPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        """Yield {'domain', 'about_dump'} for one about page."""
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'about_dump': word_dump}

    def _load_domains(self):
        """Candidate about-page URLs for every labelled domain."""
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        about_list = [d + "/about" for d in dom_list] + [d + "/about-us" for d in dom_list]
        return about_list

    def _has_content(self, txt):
        """Return True if *txt* holds at least one non-whitespace character."""
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        # Snippet was truncated here ("...") in the scraped page; explicit
        # False added for consistency with the sibling _has_content helpers.
        return False

Full Screen

Full Screen

test_tools_web.py

Source: test_tools_web.py Github

copy

Full Screen

# NOTE(review): snippet reconstructed from a whitespace-mangled scrape.
# TRAILING_CHARS and trim_url are defined above the visible chunk.
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]


@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
def test_trim_url_remove_trailing_char(trailing_char):
    test_url = 'http://example.com/'
    assert trim_url(test_url + trailing_char) == test_url
    # assert trailing_char removed only if it is trailing
    test_url = 'http://example.com/' + trailing_char + 'content'
    assert trim_url(test_url) == test_url


@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_remove_trailing_enclosing(left, right):
    # right without left => right is removed
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right)
    # right after path without left => right is removed
    test_url = 'http://example.com/a'
    assert test_url == trim_url(test_url + right)
    # trailing left without right => left is kept
    test_url = 'http://example.com/a' + left
    assert test_url == trim_url(test_url)
    # left before content without right => left is kept
    test_url = 'http://example.com/a' + left + 'something'
    assert test_url == trim_url(test_url)
    # left + content + right => right is kept
    assert test_url + right == trim_url(test_url + right)


@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_trailing_char_and_enclosing(trailing_char, left, right):
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right + trailing_char)
    # assert the trailing char is kept if there is something else
    test_url = 'http://example.com/' + trailing_char
    # …function truncated here ("...") in the scraped page

Full Screen

Full Screen

html.py

Source: html.py Github

copy

Full Screen

...57 lead, middle, trail = match.groups()58 if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http:/​/​') and \59 len(middle) > 0 and middle[0] in string.letters + string.digits and \60 (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):61 middle = '<a href="http:/​/​%s"%s>%s</​a>' % (middle, nofollow_attr, trim_url(middle))62 if middle.startswith('http:/​/​') or middle.startswith('https:/​/​'):63 middle = '<a href="%s"%s>%s</​a>' % (middle, nofollow_attr, trim_url(middle))64 if '@' in middle and not middle.startswith('www.') and not ':' in middle \65 and simple_email_re.match(middle):66 middle = '<a href="mailto:%s">%s</​a>' % (middle, middle)67 if lead + middle + trail != word:68 words[i] = lead + middle + trail69 return ''.join(words)70def clean_html(text):71 """72 Cleans the given HTML. Specifically, it does the following:73 * Converts <b> and <i> to <strong> and <em>.74 * Encodes all ampersands correctly.75 * Removes all "target" attributes from <a> tags.76 * Removes extraneous HTML, such as presentational tags that open and77 immediately close and <br clear="all">....

Full Screen

Full Screen

test_utils.py

Source: test_utils.py Github

copy

Full Screen

...24 err.get_error()["error"] == data["error"],25 err.get_error()["errcode"] == data["errcode"],26 ]27 )28def test_trim_url():29 """Test trim_url"""30 url = "https:/​/​example.com"31 assert trim_url(url) == url32 assert trim_url(f"{url}/​") == url33 path = "/​foo/​bar"34 assert trim_url(path) == path35 assert trim_url(f"{path}/​") == path36def get_nodeinfo_index(base: str):37 resp = {38 "links": [39 {40 "href": f"{base}/​.well-known/​nodeinfo/​2.0.json",41 "rel": "http:/​/​nodeinfo.diaspora.software/​ns/​schema/​2.0",42 }43 ]44 }45 return (f"{base}/​.well-known/​nodeinfo", resp)46def get_nodeinfo_resp():47 nodeinfo = {48 "version": "2.0",49 "software": {...

Full Screen

Full Screen

StackOverFlow community discussions

Questions
Discussion

Why did a playwright-python app run in Docker fail? Headless=False?

Why can't I interact (fill, click, etc.) with this element using Playwright in my Python code?

How to use playwright and beautifulsoup on web page which has pagination?

how to scrape product data from website pages that uses graphql

What are the differences between Python Playwright sync vs. async APIs?

How to get a list of all links from a dynamic web page?

Install playwright from a local directory

Using Playwright for Python, how do I select an option from a drop down list?

Playwright won't navigate to URL (Python)

Playwright: click on element within one/multiple elements using Python

After investigating and trying several things, it looks like the problem is the browser's user agent when it runs in headless mode; for some reason that page does not accept the default user agent. Try with:

def extract_html(self):
    """Fetch the flight-selection page and return the inner HTML of '#sectionBody'.

    A desktop Chrome user agent is forced because, per the answer above,
    the site rejects the default headless user agent.
    """
    target = 'http://book.flygofirst.com/Flight/Select?inl=0&CHD=0&s=True&o1=BOM&d1=BLR&ADT=1&dd1=2022-12-10&gl=0&glo=0&cc=INR&mon=true'
    desktop_ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36')
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        page = chromium.new_page(user_agent=desktop_ua)
        page.goto(target)
        return page.inner_html('#sectionBody')
https://stackoverflow.com/questions/74266883/why-did-a-plawright-python-app-run-in-docker-failed-headless-false

Blogs

Check out the latest blogs from LambdaTest on this topic:

Our Top 10 Articles Of 2021!

The year 2021 can be encapsulated as one major transition. In 2022, the current breakthroughs in the elusive fight to eliminate the COVID-19 pandemic are top of mind for enterprises globally. At the same time, we are witnessing recent strides in technological advancements as the world gets digitized. As a result, the year 2022 will see the resumption of massive changes in technology and digital transformation, driving firms to adapt and transform themselves perpetually.

Getting Started With Automation Testing Using Selenium Ruby

Ruby is a programming language which is well suitable for web automation. Ruby makes an excellent choice because of its clean syntax, focus on built-in library integrations, and an active community. Another benefit of Ruby is that it also allows other programming languages like Java, Python, etc. to be used in order to automate applications written in any other frameworks. Therefore you can use Selenium Ruby to automate any sort of application in your system and test the results in any type of testing environment

Playwright Tutorial: Getting Started With Playwright Framework

Playwright is a framework that I’ve always heard great things about but never had a chance to pick up until earlier this year. And since then, it’s become one of my favorite test automation frameworks to use when building a new automation project. It’s easy to set up, feature-packed, and one of the fastest, most reliable frameworks I’ve worked with.

How To Run Your First Playwright Test On Cloud

One of the biggest problems I’ve faced when building a test suite is not the writing of the tests but the execution. How can I execute 100s or 1000s of tests in parallel?If I try that on my local machine, it would probably catch fire – so we need a remote environment to send these to.

Playwright tutorial

LambdaTest’s Playwright tutorial will give you a broader idea about the Playwright automation framework, its unique features, and use cases with examples to exceed your understanding of Playwright testing. This tutorial will give A to Z guidance, from installing the Playwright framework to some best practices and advanced concepts.

Chapters:

  1. What is Playwright : Playwright is comparatively new but has gained good popularity. Get to know some history of the Playwright with some interesting facts connected with it.
  2. How To Install Playwright : Learn in detail about what basic configuration and dependencies are required for installing Playwright and run a test. Get a step-by-step direction for installing the Playwright automation framework.
  3. Playwright Futuristic Features: Launched in 2020, Playwright gained huge popularity quickly because of some obliging features such as Playwright Test Generator and Inspector, Playwright Reporter, Playwright auto-waiting mechanism and etc. Read up on those features to master Playwright testing.
  4. What is Component Testing: Component testing in Playwright is a unique feature that allows a tester to test a single component of a web application without integrating them with other elements. Learn how to perform Component testing on the Playwright automation framework.
  5. Inputs And Buttons In Playwright: Every website has Input boxes and buttons; learn about testing inputs and buttons with different scenarios and examples.
  6. Functions and Selectors in Playwright: Learn how to launch the Chromium browser with Playwright. Also, gain a better understanding of some important functions like “BrowserContext,” which allows you to run multiple browser sessions, and “newPage” which interacts with a page.
  7. Handling Alerts and Dropdowns in Playwright : Playwright interacts with different types of alerts and pop-ups, such as simple, confirmation, and prompt, and different types of dropdowns, such as single-selector and multi-selector. Get hands-on with handling alerts and dropdowns in Playwright testing.
  8. Playwright vs Puppeteer: Get to know about the difference between two testing frameworks and how they are different than one another, which browsers they support, and what features they provide.
  9. Run Playwright Tests on LambdaTest: Playwright testing with LambdaTest leverages test performance to the utmost. You can run multiple Playwright tests in parallel with the LambdaTest test cloud. Get a step-by-step guide to run your Playwright test on the LambdaTest platform.
  10. Playwright Python Tutorial: Playwright automation framework support all major languages such as Python, JavaScript, TypeScript, .NET and etc. However, there are various advantages to Python end-to-end testing with Playwright because of its versatile utility. Get the hang of Playwright python testing with this chapter.
  11. Playwright End To End Testing Tutorial: Get your hands on with Playwright end-to-end testing and learn to use some exciting features such as TraceViewer, Debugging, Networking, Component testing, Visual testing, and many more.
  12. Playwright Video Tutorial: Watch the video tutorials on Playwright testing from experts and get a consecutive in-depth explanation of Playwright automation testing.

Run Playwright Python automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

NotHelpful