Best Python code snippet using lemoncheesecake
Source: crawler.py
# ... (lines 1-15 truncated in the source; the fragment also relies on
# imports from that elided block: re, os, json, logging, aiohttp,
# pathlib.Path, and typing's Set, List, Optional)
from urllib3.exceptions import LocationParseError
from urllib.parse import urlparse, urljoin

class UrlUtilsMixin:

    @staticmethod
    def _normalize_link(link: str, root_url: str) -> Optional[str]:
        # Resolve protocol-relative ("//host/path") and relative links
        # against the root URL; unparseable links become None.
        try:
            parsed_url = urlparse(link)
        except ValueError:
            return None
        parsed_root_url = urlparse(root_url)
        if link.startswith("//"):
            return f"{parsed_root_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
        if not parsed_url.scheme:
            return urljoin(root_url, link)
        return link

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        # Accept http(s)/ftp(s) URLs with a host name or dotted-quad IP,
        # an optional port, and an optional path/query.
        regex = re.compile(
            r'^(?:http|ftp)s?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return re.match(regex, url) is not None

class Crawler(UrlUtilsMixin):
    """
    Crawler that browses random pages from a given set of links and stores them into
    the `cache` folder.
    """

    _links: Set[str]
    _blacklist: Set[str]

    def __init__(self, links: int, parallel: int, cache: bool) -> None:
        self._links = set()
        self._blacklist = set()
        self._total_links = links
        self._parallel = parallel
        self._cache = cache
        if self._cache:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            self._cache_dir = str(Path(dir_path) / 'cache')
            # os.makedirs is portable and avoids shelling out with
            # os.system(f'mkdir -p {self._cache_dir}') as the original did
            os.makedirs(self._cache_dir, exist_ok=True)

    async def _request(self, url: str) -> Optional[str]:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=5) as response:
                    return await response.text()
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            logging.debug("Exception on URL: %s", url)
            return None

    def _is_blacklisted(self, url: str) -> bool:
        return url in self._blacklist

    def _should_accept_url(self, url: str) -> bool:
        return url and self._is_valid_url(url) and not self._is_blacklisted(url)

    def _extract_urls(self, body: str, root_url: str) -> List[str]:
        # Collect href targets from the page body, skipping pure "#fragment"
        # links, then normalize and filter them.
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"
        urls = re.findall(pattern, str(body))
        normalized_urls = [self._normalize_link(url, root_url) for url in urls]
        filtered_urls = list(filter(self._should_accept_url, normalized_urls))
        return filtered_urls

    def load_config_file(self, file_path: str) -> None:
        """
        Load a configuration file with blacklisted urls and root urls for starting
        the crawler.
        """
        with open(file_path, 'r') as config_file:
            config = json.load(config_file)
            for link in config['blacklisted_urls']:
                self._blacklist.add(link)
            for link in config["root_urls"]:
                self._links.add(link)

    async def browse(self) -> None:
        # ... (remainder truncated in the source)
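Only the JSON keys ("blacklisted_urls", "root_urls") and the constructor signature in the driver sketch below are taken from the snippet itself; the file name, argument values, and example URLs are assumptions made for illustration, not the project's actual entry point.

# Driver sketch for the Crawler above (hypothetical values throughout).
import asyncio
import json

config = {
    "blacklisted_urls": ["https://ads.example.com"],  # hypothetical entry
    "root_urls": ["https://example.com"],             # hypothetical entry
}
with open("config.json", "w") as fh:
    json.dump(config, fh)

crawler = Crawler(links=100, parallel=4, cache=True)  # signature from __init__ above
crawler.load_config_file("config.json")
asyncio.run(crawler.browse())  # browse() is truncated in the snippet above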
Source: test_crawler_blocking.py
# ... (lines 1-63 truncated in the source)
        )
    )
    filter_result.sort()
    assert filter_result == checkup_html[result]

def test_crawler_normalize_link():
    cr = crawler.Crawler("https://google.com", "")
    assert (
        cr._normalize_link("https://google.com/help", "https://google.com/")
        == "https://google.com/help"
    )
    assert (
        cr._normalize_link("/help", "https://google.com/")
        == "https://google.com/help"
    )
    assert (
        cr._normalize_link(
            "https://mail.google.com/help#fragment", "https://google.com/"
        )
        == "https://mail.google.com/help"
    )

def test_remove_query():
    cr = crawler.Crawler("https://google.com", "")
    assert (
        cr._remove_query("https://google.com/search?page=42")
        == "https://google.com/search"
    # ... (remainder truncated in the source)
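Note that this test file constructs its Crawler with a URL string (crawler.Crawler("https://google.com", "")), not the (links, parallel, cache) signature of the crawler.py snippet above, so the two fragments appear to come from different projects. As a sanity check on the expected behavior, here is a standard-library-only sketch that reproduces the same normalizations the tests assert, without assuming anything about this project's Crawler internals:

# Stdlib-only reproduction of the normalizations asserted by the tests.
from urllib.parse import urljoin, urlparse, urlunparse

# Relative-link resolution, as in test_crawler_normalize_link:
assert urljoin("https://google.com/", "/help") == "https://google.com/help"

# Dropping a "#fragment", as in the third assertion:
parts = urlparse("https://mail.google.com/help#fragment")
assert urlunparse(parts._replace(fragment="")) == "https://mail.google.com/help"

# Stripping a query string, as in test_remove_query:
parts = urlparse("https://google.com/search?page=42")
assert urlunparse(parts._replace(query="")) == "https://google.com/search"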