Best Python code snippet using selene_python
test_tree.py
Source: test_tree.py
1import datetime2import difflib3import textwrap4from decimal import Decimal5from email.utils import format_datetime6from unittest import TestCase7import requests_mock8from dateutil.tz import tzoffset9from tests.helpers import gzip10from usp.log import create_logger11from usp.objects.page import (12 SitemapPage,13 SitemapNewsStory,14 SitemapPageChangeFrequency,15)16from usp.objects.sitemap import (17 IndexRobotsTxtSitemap,18 PagesXMLSitemap,19 IndexXMLSitemap,20 InvalidSitemap,21 PagesTextSitemap,22 IndexWebsiteSitemap,23 PagesRSSSitemap,24 PagesAtomSitemap,25)26from usp.tree import sitemap_tree_for_homepage27# FIXME various exotic properties28# FIXME XML vulnerabilities with Expat29# FIXME max. recursion level30# FIXME tests responses that are too big31log = create_logger(__name__)32class TestSitemapTree(TestCase):33 TEST_BASE_URL = 'http:/โ/โtest_ultimate-sitemap-parser.com' # mocked by HTTPretty34 # Publication /โ "last modified" date35 TEST_DATE_DATETIME = datetime.datetime(36 year=2009, month=12, day=17, hour=12, minute=4, second=56,37 tzinfo=tzoffset(None, 7200),38 )39 TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat()40 """Test string date formatted as ISO 8601 (for XML and Atom 0.3 /โ 1.0 sitemaps)."""41 TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME)42 """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps)."""43 TEST_PUBLICATION_NAME = 'Test publication'44 TEST_PUBLICATION_LANGUAGE = 'en'45 @staticmethod46 def fallback_to_404_not_found_matcher(request):47 """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress."""48 return requests_mock.create_response(49 request,50 status_code=404,51 reason='Not Found',52 headers={'Content-Type': 'text/โhtml'},53 text="<h1>404 Not Found!</โh1>",54 )55 # noinspection DuplicatedCode56 def test_sitemap_tree_for_homepage(self):57 """Test sitemap_tree_for_homepage()."""58 with requests_mock.Mocker() as m:59 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)60 
m.get(61 self.TEST_BASE_URL + '/โ',62 text='This is a homepage.',63 )64 m.get(65 self.TEST_BASE_URL + '/โrobots.txt',66 headers={'Content-Type': 'text/โplain'},67 text=textwrap.dedent("""68 User-agent: *69 Disallow: /โwhatever70 71 Sitemap: {base_url}/โsitemap_pages.xml72 73 # Intentionally spelled as "Site-map" as Google tolerates this:74 # https:/โ/โgithub.com/โgoogle/โrobotstxt/โblob/โmaster/โrobots.cc#L703 75 Site-map: {base_url}/โsitemap_news_index_1.xml76 """.format(base_url=self.TEST_BASE_URL)).strip(),77 )78 # One sitemap for random static pages79 m.get(80 self.TEST_BASE_URL + '/โsitemap_pages.xml',81 headers={'Content-Type': 'application/โxml'},82 text=textwrap.dedent("""83 <?xml version="1.0" encoding="UTF-8"?>84 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9">85 <url>86 <loc>{base_url}/โabout.html</โloc>87 <lastmod>{last_modified_date}</โlastmod>88 <changefreq>monthly</โchangefreq>89 <priority>0.8</โpriority>90 </โurl>91 <url>92 <loc>{base_url}/โcontact.html</โloc>93 <lastmod>{last_modified_date}</โlastmod>94 95 <!-- Invalid change frequency -->96 <changefreq>when we feel like it</โchangefreq>97 98 <!-- Invalid priority -->99 <priority>1.1</โpriority>100 101 </โurl>102 </โurlset>103 """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(),104 )105 # Index sitemap pointing to sitemaps with stories106 m.get(107 self.TEST_BASE_URL + '/โsitemap_news_index_1.xml',108 headers={'Content-Type': 'application/โxml'},109 text=textwrap.dedent("""110 <?xml version="1.0" encoding="UTF-8"?>111 <sitemapindex xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9">112 <sitemap>113 <loc>{base_url}/โsitemap_news_1.xml</โloc>114 <lastmod>{last_modified}</โlastmod>115 </โsitemap>116 <sitemap>117 <loc>{base_url}/โsitemap_news_index_2.xml</โloc>118 <lastmod>{last_modified}</โlastmod>119 </โsitemap>120 </โsitemapindex>121 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),122 )123 # 
First sitemap with actual stories124 m.get(125 self.TEST_BASE_URL + '/โsitemap_news_1.xml',126 headers={'Content-Type': 'application/โxml'},127 text=textwrap.dedent("""128 <?xml version="1.0" encoding="UTF-8"?>129 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"130 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9"131 xmlns:xhtml="http:/โ/โwww.w3.org/โ1999/โxhtml">132 133 <url>134 <loc>{base_url}/โnews/โfoo.html</โloc>135 136 <!-- Element present but empty -->137 <lastmod /โ>138 139 <!-- Some other XML namespace -->140 <xhtml:link rel="alternate"141 media="only screen and (max-width: 640px)"142 href="{base_url}/โnews/โfoo.html?mobile=1" /โ>143 144 <news:news>145 <news:publication>146 <news:name>{publication_name}</โnews:name>147 <news:language>{publication_language}</โnews:language>148 </โnews:publication>149 <news:publication_date>{publication_date}</โnews:publication_date>150 <news:title>Foo <foo></โnews:title> <!-- HTML entity decoding -->151 </โnews:news>152 </โurl>153 154 <!-- Has a duplicate story in /โsitemap_news_2.xml -->155 <url>156 <loc>{base_url}/โnews/โbar.html</โloc>157 <xhtml:link rel="alternate"158 media="only screen and (max-width: 640px)"159 href="{base_url}/โnews/โbar.html?mobile=1" /โ>160 <news:news>161 <news:publication>162 <news:name>{publication_name}</โnews:name>163 <news:language>{publication_language}</โnews:language>164 </โnews:publication>165 <news:publication_date>{publication_date}</โnews:publication_date>166 <news:title>Bar & bar</โnews:title>167 </โnews:news>168 </โurl>169 170 </โurlset>171 """.format(172 base_url=self.TEST_BASE_URL,173 publication_name=self.TEST_PUBLICATION_NAME,174 publication_language=self.TEST_PUBLICATION_LANGUAGE,175 publication_date=self.TEST_DATE_STR_ISO8601,176 )).strip(),177 )178 # Another index sitemap pointing to a second sitemaps with stories179 m.get(180 self.TEST_BASE_URL + '/โsitemap_news_index_2.xml',181 headers={'Content-Type': 'application/โxml'},182 
text=textwrap.dedent("""183 <?xml version="1.0" encoding="UTF-8"?>184 <sitemapindex xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9">185 186 <sitemap>187 <!-- Extra whitespace added around URL -->188 <loc> {base_url}/โsitemap_news_2.xml </โloc>189 <lastmod>{last_modified}</โlastmod>190 </โsitemap>191 192 <!-- Nonexistent sitemap -->193 <sitemap>194 <loc>{base_url}/โsitemap_news_missing.xml</โloc>195 <lastmod>{last_modified}</โlastmod>196 </โsitemap>197 198 </โsitemapindex>199 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),200 )201 # Second sitemap with actual stories202 m.get(203 self.TEST_BASE_URL + '/โsitemap_news_2.xml',204 headers={'Content-Type': 'application/โxml'},205 text=textwrap.dedent("""206 <?xml version="1.0" encoding="UTF-8"?>207 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"208 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9"209 xmlns:xhtml="http:/โ/โwww.w3.org/โ1999/โxhtml">210 211 <!-- Has a duplicate story in /โsitemap_news_1.xml -->212 <url>213 <!-- Extra whitespace added around URL -->214 <loc> {base_url}/โnews/โbar.html </โloc>215 <xhtml:link rel="alternate"216 media="only screen and (max-width: 640px)"217 href="{base_url}/โnews/โbar.html?mobile=1#fragment_is_to_be_removed" /โ>218 <news:news>219 <news:publication>220 <news:name>{publication_name}</โnews:name>221 <news:language>{publication_language}</โnews:language>222 </โnews:publication>223 <news:publication_date>{publication_date}</โnews:publication_date>224 225 <tag_without_inner_character_data name="value" /โ>226 227 <news:title>Bar & bar</โnews:title>228 </โnews:news>229 </โurl>230 231 <url>232 <loc>{base_url}/โnews/โbaz.html</โloc>233 <xhtml:link rel="alternate"234 media="only screen and (max-width: 640px)"235 href="{base_url}/โnews/โbaz.html?mobile=1" /โ>236 <news:news>237 <news:publication>238 <news:name>{publication_name}</โnews:name>239 <news:language>{publication_language}</โnews:language>240 
</โnews:publication>241 <news:publication_date>{publication_date}</โnews:publication_date>242 <news:title><![CDATA[Bรย
ร
ยพ]]></โnews:title> <!-- CDATA and UTF-8 -->243 </โnews:news>244 </โurl>245 246 </โurlset>247 """.format(248 base_url=self.TEST_BASE_URL,249 publication_name=self.TEST_PUBLICATION_NAME,250 publication_language=self.TEST_PUBLICATION_LANGUAGE,251 publication_date=self.TEST_DATE_STR_ISO8601,252 )).strip(),253 )254 # Nonexistent sitemap255 m.get(256 self.TEST_BASE_URL + '/โsitemap_news_missing.xml',257 status_code=404,258 reason='Not Found',259 headers={'Content-Type': 'text/โhtml'},260 text="<h1>404 Not Found!</โh1>",261 )262 expected_sitemap_tree = IndexWebsiteSitemap(263 url='{}/โ'.format(self.TEST_BASE_URL),264 sub_sitemaps=[265 IndexRobotsTxtSitemap(266 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),267 sub_sitemaps=[268 PagesXMLSitemap(269 url='{}/โsitemap_pages.xml'.format(self.TEST_BASE_URL),270 pages=[271 SitemapPage(272 url='{}/โabout.html'.format(self.TEST_BASE_URL),273 last_modified=self.TEST_DATE_DATETIME,274 news_story=None,275 change_frequency=SitemapPageChangeFrequency.MONTHLY,276 priority=Decimal('0.8'),277 ),278 SitemapPage(279 url='{}/โcontact.html'.format(self.TEST_BASE_URL),280 last_modified=self.TEST_DATE_DATETIME,281 news_story=None,282 # Invalid input -- should be reset to "always"283 change_frequency=SitemapPageChangeFrequency.ALWAYS,284 # Invalid input -- should be reset to 0.5 (the default as per the spec)285 priority=Decimal('0.5'),286 )287 ],288 ),289 IndexXMLSitemap(290 url='{}/โsitemap_news_index_1.xml'.format(self.TEST_BASE_URL),291 sub_sitemaps=[292 PagesXMLSitemap(293 url='{}/โsitemap_news_1.xml'.format(self.TEST_BASE_URL),294 pages=[295 SitemapPage(296 url='{}/โnews/โfoo.html'.format(self.TEST_BASE_URL),297 news_story=SitemapNewsStory(298 title='Foo <foo>',299 publish_date=self.TEST_DATE_DATETIME,300 publication_name=self.TEST_PUBLICATION_NAME,301 publication_language=self.TEST_PUBLICATION_LANGUAGE,302 ),303 ),304 SitemapPage(305 url='{}/โnews/โbar.html'.format(self.TEST_BASE_URL),306 news_story=SitemapNewsStory(307 title='Bar & 
bar',308 publish_date=self.TEST_DATE_DATETIME,309 publication_name=self.TEST_PUBLICATION_NAME,310 publication_language=self.TEST_PUBLICATION_LANGUAGE,311 ),312 ),313 ]314 ),315 IndexXMLSitemap(316 url='{}/โsitemap_news_index_2.xml'.format(self.TEST_BASE_URL),317 sub_sitemaps=[318 PagesXMLSitemap(319 url='{}/โsitemap_news_2.xml'.format(self.TEST_BASE_URL),320 pages=[321 SitemapPage(322 url='{}/โnews/โbar.html'.format(self.TEST_BASE_URL),323 news_story=SitemapNewsStory(324 title='Bar & bar',325 publish_date=self.TEST_DATE_DATETIME,326 publication_name=self.TEST_PUBLICATION_NAME,327 publication_language=self.TEST_PUBLICATION_LANGUAGE,328 ),329 ),330 SitemapPage(331 url='{}/โnews/โbaz.html'.format(self.TEST_BASE_URL),332 news_story=SitemapNewsStory(333 title='Bรย
ร
ยพ',334 publish_date=self.TEST_DATE_DATETIME,335 publication_name=self.TEST_PUBLICATION_NAME,336 publication_language=self.TEST_PUBLICATION_LANGUAGE,337 ),338 ),339 ],340 ),341 InvalidSitemap(342 url='{}/โsitemap_news_missing.xml'.format(self.TEST_BASE_URL),343 reason=(344 'Unable to fetch sitemap from {base_url}/โsitemap_news_missing.xml: '345 '404 Not Found'346 ).format(base_url=self.TEST_BASE_URL),347 ),348 ],349 ),350 ],351 ),352 ],353 )354 ]355 )356 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)357 expected_lines = str(expected_sitemap_tree).split()358 actual_lines = str(actual_sitemap_tree).split()359 diff = difflib.ndiff(expected_lines, actual_lines)360 diff_str = '\n'.join(diff)361 assert expected_sitemap_tree == actual_sitemap_tree, diff_str362 assert len(list(actual_sitemap_tree.all_pages())) == 6363 def test_sitemap_tree_for_homepage_gzip(self):364 """Test sitemap_tree_for_homepage() with gzipped sitemaps."""365 with requests_mock.Mocker() as m:366 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)367 m.get(368 self.TEST_BASE_URL + '/โ',369 text='This is a homepage.',370 )371 m.get(372 self.TEST_BASE_URL + '/โrobots.txt',373 headers={'Content-Type': 'text/โplain'},374 text=textwrap.dedent("""375 User-agent: *376 Disallow: /โwhatever377 378 Sitemap: {base_url}/โsitemap_1.gz379 Sitemap: {base_url}/โsitemap_2.dat380 Sitemap: {base_url}/โsitemap_3.xml.gz381 """.format(base_url=self.TEST_BASE_URL)).strip(),382 )383 # Gzipped sitemap without correct HTTP header but with .gz extension384 m.get(385 self.TEST_BASE_URL + '/โsitemap_1.gz',386 content=gzip(textwrap.dedent("""387 <?xml version="1.0" encoding="UTF-8"?>388 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"389 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">390 <url>391 <loc>{base_url}/โnews/โfoo.html</โloc>392 <news:news>393 <news:publication>394 <news:name>{publication_name}</โnews:name>395 
<news:language>{publication_language}</โnews:language>396 </โnews:publication>397 <news:publication_date>{publication_date}</โnews:publication_date>398 <news:title>Foo <foo></โnews:title> <!-- HTML entity decoding -->399 </โnews:news>400 </โurl>401 </โurlset>402 """.format(403 base_url=self.TEST_BASE_URL,404 publication_name=self.TEST_PUBLICATION_NAME,405 publication_language=self.TEST_PUBLICATION_LANGUAGE,406 publication_date=self.TEST_DATE_STR_ISO8601,407 )).strip()),408 )409 # Gzipped sitemap with correct HTTP header but without .gz extension410 m.get(411 self.TEST_BASE_URL + '/โsitemap_2.dat',412 headers={'Content-Type': 'application/โx-gzip'},413 content=gzip(textwrap.dedent("""414 <?xml version="1.0" encoding="UTF-8"?>415 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"416 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">417 <url>418 <loc>{base_url}/โnews/โbar.html</โloc>419 <news:news>420 <news:publication>421 <news:name>{publication_name}</โnews:name>422 <news:language>{publication_language}</โnews:language>423 </โnews:publication>424 <news:publication_date>{publication_date}</โnews:publication_date>425 <news:title><![CDATA[Bรย
r]]></โnews:title> <!-- CDATA and UTF-8 -->426 </โnews:news>427 </โurl>428 </โurlset>429 """.format(430 base_url=self.TEST_BASE_URL,431 publication_name=self.TEST_PUBLICATION_NAME,432 publication_language=self.TEST_PUBLICATION_LANGUAGE,433 publication_date=self.TEST_DATE_STR_ISO8601,434 )).strip()),435 )436 # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't437 m.get(438 self.TEST_BASE_URL + '/โsitemap_3.xml.gz',439 headers={'Content-Type': 'application/โx-gzip'},440 text=textwrap.dedent("""441 <?xml version="1.0" encoding="UTF-8"?>442 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"443 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">444 <url>445 <loc>{base_url}/โnews/โbaz.html</โloc>446 <news:news>447 <news:publication>448 <news:name>{publication_name}</โnews:name>449 <news:language>{publication_language}</โnews:language>450 </โnews:publication>451 <news:publication_date>{publication_date}</โnews:publication_date>452 <news:title><![CDATA[Bรย
ร
ยพ]]></โnews:title> <!-- CDATA and UTF-8 -->453 </โnews:news>454 </โurl>455 </โurlset>456 """.format(457 base_url=self.TEST_BASE_URL,458 publication_name=self.TEST_PUBLICATION_NAME,459 publication_language=self.TEST_PUBLICATION_LANGUAGE,460 publication_date=self.TEST_DATE_STR_ISO8601,461 )).strip(),462 )463 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)464 # Don't do an in-depth check, we just need to make sure that gunzip works465 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)466 assert len(actual_sitemap_tree.sub_sitemaps) == 1467 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)468 # noinspection PyUnresolvedReferences469 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3470 # noinspection PyUnresolvedReferences471 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]472 assert isinstance(sitemap_1, PagesXMLSitemap)473 assert len(sitemap_1.pages) == 1474 # noinspection PyUnresolvedReferences475 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]476 assert isinstance(sitemap_2, PagesXMLSitemap)477 assert len(sitemap_2.pages) == 1478 # noinspection PyUnresolvedReferences479 sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2]480 assert isinstance(sitemap_3, PagesXMLSitemap)481 assert len(sitemap_3.pages) == 1482 def test_sitemap_tree_for_homepage_plain_text(self):483 """Test sitemap_tree_for_homepage() with plain text sitemaps."""484 with requests_mock.Mocker() as m:485 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)486 m.get(487 self.TEST_BASE_URL + '/โ',488 text='This is a homepage.',489 )490 m.get(491 self.TEST_BASE_URL + '/โrobots.txt',492 headers={'Content-Type': 'text/โplain'},493 text=textwrap.dedent("""494 User-agent: *495 Disallow: /โwhatever496 497 Sitemap: {base_url}/โsitemap_1.txt498 Sitemap: {base_url}/โsitemap_2.txt.dat499 """.format(base_url=self.TEST_BASE_URL)).strip(),500 )501 # Plain text uncompressed sitemap 
(no Content-Type header)502 m.get(503 self.TEST_BASE_URL + '/โsitemap_1.txt',504 text=textwrap.dedent("""505 506 {base_url}/โnews/โfoo.html507 508 509 {base_url}/โnews/โbar.html510 511 Some other stuff which totally doesn't look like an URL512 """.format(base_url=self.TEST_BASE_URL)).strip(),513 )514 # Plain text compressed sitemap without .gz extension515 m.get(516 self.TEST_BASE_URL + '/โsitemap_2.txt.dat',517 headers={'Content-Type': 'application/โx-gzip'},518 content=gzip(textwrap.dedent("""519 {base_url}/โnews/โbar.html520 {base_url}/โnews/โbaz.html521 """.format(base_url=self.TEST_BASE_URL)).strip()),522 )523 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)524 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)525 assert len(actual_sitemap_tree.sub_sitemaps) == 1526 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)527 # noinspection PyUnresolvedReferences528 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2529 # noinspection PyUnresolvedReferences530 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]531 assert isinstance(sitemap_1, PagesTextSitemap)532 assert len(sitemap_1.pages) == 2533 # noinspection PyUnresolvedReferences534 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]535 assert isinstance(sitemap_2, PagesTextSitemap)536 assert len(sitemap_2.pages) == 2537 pages = list(actual_sitemap_tree.all_pages())538 assert len(pages) == 4539 assert SitemapPage(url='{}/โnews/โfoo.html'.format(self.TEST_BASE_URL)) in pages540 assert SitemapPage(url='{}/โnews/โbar.html'.format(self.TEST_BASE_URL)) in pages541 assert SitemapPage(url='{}/โnews/โbaz.html'.format(self.TEST_BASE_URL)) in pages542 # noinspection DuplicatedCode543 def test_sitemap_tree_for_homepage_rss_atom(self):544 """Test sitemap_tree_for_homepage() with RSS 2.0 /โ Atom 0.3 /โ Atom 1.0 feeds."""545 with requests_mock.Mocker() as m:546 
m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)547 m.get(548 self.TEST_BASE_URL + '/โ',549 text='This is a homepage.',550 )551 m.get(552 self.TEST_BASE_URL + '/โrobots.txt',553 headers={'Content-Type': 'text/โplain'},554 text=textwrap.dedent("""555 User-agent: *556 Disallow: /โwhatever557 Sitemap: {base_url}/โsitemap_rss.xml558 Sitemap: {base_url}/โsitemap_atom_0_3.xml559 Sitemap: {base_url}/โsitemap_atom_1_0.xml560 """.format(base_url=self.TEST_BASE_URL)).strip(),561 )562 # RSS 2.0 sitemap563 m.get(564 self.TEST_BASE_URL + '/โsitemap_rss.xml',565 headers={'Content-Type': 'application/โrss+xml'},566 text=textwrap.dedent("""567 <?xml version="1.0" encoding="UTF-8"?>568 <rss version="2.0">569 <channel>570 <title>Test RSS 2.0 feed</โtitle>571 <description>This is a test RSS 2.0 feed.</โdescription>572 <link>{base_url}</โlink>573 <pubDate>{pub_date}</โpubDate>574 <item>575 <title>Test RSS 2.0 story #1</โtitle>576 <description>This is a test RSS 2.0 story #1.</โdescription>577 <link>{base_url}/โrss_story_1.html</โlink>578 <guid isPermaLink="true">{base_url}/โrss_story_1.html</โguid>579 <pubDate>{pub_date}</โpubDate>580 </โitem>581 <item>582 <title>Test RSS 2.0 story #2</โtitle>583 <description>This is a test RSS 2.0 story #2.</โdescription>584 <link>{base_url}/โrss_story_2.html</โlink>585 <guid isPermaLink="true">{base_url}/โrss_story_2.html</โguid>586 <pubDate>{pub_date}</โpubDate>587 </โitem>588 </โchannel>589 </โrss>590 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),591 )592 # Atom 0.3 sitemap593 m.get(594 self.TEST_BASE_URL + '/โsitemap_atom_0_3.xml',595 headers={'Content-Type': 'application/โatom+xml'},596 text=textwrap.dedent("""597 <?xml version="1.0" encoding="UTF-8"?>598 <feed version="0.3" xmlns="http:/โ/โpurl.org/โatom/โns#">599 <title>Test Atom 0.3 feed</โtitle>600 <link rel="alternate" type="text/โhtml" href="{base_url}" /โ>601 <modified>{pub_date}</โmodified>602 <entry>603 <title>Test Atom 0.3 story 
#1</โtitle>604 <link rel="alternate" type="text/โhtml" href="{base_url}/โatom_0_3_story_1.html" /โ>605 <id>{base_url}/โatom_0_3_story_1.html</โid>606 <issued>{pub_date}</โissued>607 </โentry>608 <entry>609 <title>Test Atom 0.3 story #2</โtitle>610 <link rel="alternate" type="text/โhtml" href="{base_url}/โatom_0_3_story_2.html" /โ>611 <id>{base_url}/โatom_0_3_story_2.html</โid>612 <issued>{pub_date}</โissued>613 </โentry>614 </โfeed>615 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),616 )617 # Atom 1.0 sitemap618 m.get(619 self.TEST_BASE_URL + '/โsitemap_atom_1_0.xml',620 headers={'Content-Type': 'application/โatom+xml'},621 text=textwrap.dedent("""622 <?xml version="1.0" encoding="UTF-8"?>623 <feed xmlns="http:/โ/โwww.w3.org/โ2005/โAtom">624 <title>Test Atom 1.0 feed</โtitle>625 <subtitle>This is a test Atom 1.0 feed.</โsubtitle>626 <link href="{base_url}/โsitemap_atom_1_0.xml" rel="self" /โ>627 <link href="{base_url}" /โ>628 <id>{base_url}</โid>629 <updated>{pub_date}</โupdated>630 <entry>631 <title>Test Atom 1.0 story #1</โtitle>632 <link href="{base_url}/โatom_1_0_story_1.html" /โ>633 <link rel="alternate" type="text/โhtml" href="{base_url}/โatom_1_0_story_1.html?alt" /โ>634 <link rel="edit" href="{base_url}/โatom_1_0_story_1.html?edit" /โ>635 <id>{base_url}/โatom_1_0_story_1.html</โid>636 <updated>{pub_date}</โupdated>637 <summary>This is test atom 1.0 story #1.</โsummary>638 <content type="xhtml">639 <div xmlns="http:/โ/โwww.w3.org/โ1999/โxhtml">640 <p>This is test atom 1.0 story #1.</โp>641 </โdiv>642 </โcontent>643 <author>644 <name>John Doe</โname>645 <email>johndoe@example.com</โemail>646 </โauthor>647 </โentry>648 <entry>649 <title>Test Atom 1.0 story #2</โtitle>650 <link href="{base_url}/โatom_1_0_story_2.html" /โ>651 <link rel="alternate" type="text/โhtml" href="{base_url}/โatom_1_0_story_2.html?alt" /โ>652 <link rel="edit" href="{base_url}/โatom_1_0_story_2.html?edit" /โ>653 
<id>{base_url}/โatom_1_0_story_2.html</โid>654 <updated>{pub_date}</โupdated>655 <summary>This is test atom 1.0 story #2.</โsummary>656 <content type="xhtml">657 <div xmlns="http:/โ/โwww.w3.org/โ1999/โxhtml">658 <p>This is test atom 1.0 story #2.</โp>659 </โdiv>660 </โcontent>661 <author>662 <name>John Doe</โname>663 <email>johndoe@example.com</โemail>664 </โauthor>665 </โentry>666 </โfeed>667 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),668 )669 expected_sitemap_tree = IndexWebsiteSitemap(670 url='{}/โ'.format(self.TEST_BASE_URL),671 sub_sitemaps=[672 IndexRobotsTxtSitemap(673 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),674 sub_sitemaps=[675 PagesRSSSitemap(676 url='{}/โsitemap_rss.xml'.format(self.TEST_BASE_URL),677 pages=[678 SitemapPage(679 url='{}/โrss_story_1.html'.format(self.TEST_BASE_URL),680 news_story=SitemapNewsStory(681 title='Test RSS 2.0 story #1',682 publish_date=self.TEST_DATE_DATETIME,683 ),684 ),685 SitemapPage(686 url='{}/โrss_story_2.html'.format(self.TEST_BASE_URL),687 news_story=SitemapNewsStory(688 title='Test RSS 2.0 story #2',689 publish_date=self.TEST_DATE_DATETIME,690 )691 )692 ]693 ),694 PagesAtomSitemap(695 url='{}/โsitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),696 pages=[697 SitemapPage(698 url='{}/โatom_0_3_story_1.html'.format(self.TEST_BASE_URL),699 news_story=SitemapNewsStory(700 title='Test Atom 0.3 story #1',701 publish_date=self.TEST_DATE_DATETIME,702 ),703 ),704 SitemapPage(705 url='{}/โatom_0_3_story_2.html'.format(self.TEST_BASE_URL),706 news_story=SitemapNewsStory(707 title='Test Atom 0.3 story #2',708 publish_date=self.TEST_DATE_DATETIME,709 )710 )711 ]712 ),713 PagesAtomSitemap(714 url='{}/โsitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),715 pages=[716 SitemapPage(717 url='{}/โatom_1_0_story_1.html'.format(self.TEST_BASE_URL),718 news_story=SitemapNewsStory(719 title='Test Atom 1.0 story #1',720 publish_date=self.TEST_DATE_DATETIME,721 ),722 ),723 SitemapPage(724 
url='{}/โatom_1_0_story_2.html'.format(self.TEST_BASE_URL),725 news_story=SitemapNewsStory(726 title='Test Atom 1.0 story #2',727 publish_date=self.TEST_DATE_DATETIME,728 )729 )730 ]731 ),732 ]733 )734 ]735 )736 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)737 expected_lines = str(expected_sitemap_tree).split()738 actual_lines = str(actual_sitemap_tree).split()739 diff = difflib.ndiff(expected_lines, actual_lines)740 diff_str = '\n'.join(diff)741 assert expected_sitemap_tree == actual_sitemap_tree, diff_str742 assert len(list(actual_sitemap_tree.all_pages())) == 6743 def test_sitemap_tree_for_homepage_rss_atom_empty(self):744 """Test sitemap_tree_for_homepage() with empty RSS 2.0 /โ Atom 0.3 /โ Atom 1.0 feeds."""745 with requests_mock.Mocker() as m:746 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)747 m.get(748 self.TEST_BASE_URL + '/โ',749 text='This is a homepage.',750 )751 m.get(752 self.TEST_BASE_URL + '/โrobots.txt',753 headers={'Content-Type': 'text/โplain'},754 text=textwrap.dedent("""755 User-agent: *756 Disallow: /โwhatever757 Sitemap: {base_url}/โsitemap_rss.xml758 Sitemap: {base_url}/โsitemap_atom_0_3.xml759 Sitemap: {base_url}/โsitemap_atom_1_0.xml760 """.format(base_url=self.TEST_BASE_URL)).strip(),761 )762 # RSS 2.0 sitemap763 m.get(764 self.TEST_BASE_URL + '/โsitemap_rss.xml',765 headers={'Content-Type': 'application/โrss+xml'},766 text=textwrap.dedent("""767 <?xml version="1.0" encoding="UTF-8"?>768 <rss version="2.0">769 <channel>770 <title>Test RSS 2.0 feed</โtitle>771 <description>This is a test RSS 2.0 feed.</โdescription>772 <link>{base_url}</โlink>773 <pubDate>{pub_date}</โpubDate>774 </โchannel>775 </โrss>776 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),777 )778 # Atom 0.3 sitemap779 m.get(780 self.TEST_BASE_URL + '/โsitemap_atom_0_3.xml',781 headers={'Content-Type': 'application/โatom+xml'},782 text=textwrap.dedent("""783 <?xml version="1.0" 
encoding="UTF-8"?>784 <feed version="0.3" xmlns="http:/โ/โpurl.org/โatom/โns#">785 <title>Test Atom 0.3 feed</โtitle>786 <link rel="alternate" type="text/โhtml" href="{base_url}" /โ>787 <modified>{pub_date}</โmodified>788 </โfeed>789 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),790 )791 # Atom 1.0 sitemap792 m.get(793 self.TEST_BASE_URL + '/โsitemap_atom_1_0.xml',794 headers={'Content-Type': 'application/โatom+xml'},795 text=textwrap.dedent("""796 <?xml version="1.0" encoding="UTF-8"?>797 <feed xmlns="http:/โ/โwww.w3.org/โ2005/โAtom">798 <title>Test Atom 1.0 feed</โtitle>799 <subtitle>This is a test Atom 1.0 feed.</โsubtitle>800 <link href="{base_url}/โsitemap_atom_1_0.xml" rel="self" /โ>801 <link href="{base_url}" /โ>802 <id>{base_url}</โid>803 <updated>{pub_date}</โupdated>804 </โfeed>805 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),806 )807 expected_sitemap_tree = IndexWebsiteSitemap(808 url='{}/โ'.format(self.TEST_BASE_URL),809 sub_sitemaps=[810 IndexRobotsTxtSitemap(811 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),812 sub_sitemaps=[813 PagesRSSSitemap(814 url='{}/โsitemap_rss.xml'.format(self.TEST_BASE_URL),815 pages=[]816 ),817 PagesAtomSitemap(818 url='{}/โsitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),819 pages=[]820 ),821 PagesAtomSitemap(822 url='{}/โsitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),823 pages=[]824 ),825 ]826 )827 ]828 )829 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)830 assert expected_sitemap_tree == actual_sitemap_tree831 assert len(list(actual_sitemap_tree.all_pages())) == 0832 def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):833 """Test sitemap_tree_for_homepage() with clipped XML.834 Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the835 server is unable to generate and compress a 50 MB sitemap XML. 
Google News doesn't seem to have a problem with836 this behavior, so we have to support this too.837 """838 with requests_mock.Mocker() as m:839 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)840 m.get(841 self.TEST_BASE_URL + '/โ',842 text='This is a homepage.',843 )844 m.get(845 self.TEST_BASE_URL + '/โrobots.txt',846 headers={'Content-Type': 'text/โplain'},847 text=textwrap.dedent("""848 User-agent: *849 Disallow: /โwhatever850 851 Sitemap: {base_url}/โsitemap.xml852 """.format(base_url=self.TEST_BASE_URL)).strip(),853 )854 m.get(855 self.TEST_BASE_URL + '/โsitemap.xml',856 text=textwrap.dedent("""857 <?xml version="1.0" encoding="UTF-8"?>858 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"859 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">860 <url>861 <loc>{base_url}/โnews/โfirst.html</โloc>862 <news:news>863 <news:publication>864 <news:name>{publication_name}</โnews:name>865 <news:language>{publication_language}</โnews:language>866 </โnews:publication>867 <news:publication_date>{publication_date}</โnews:publication_date>868 <news:title>First story</โnews:title>869 </โnews:news>870 </โurl>871 <url>872 <loc>{base_url}/โnews/โsecond.html</โloc>873 <news:news>874 <news:publication>875 <news:name>{publication_name}</โnews:name>876 <news:language>{publication_language}</โnews:language>877 </โnews:publication>878 <news:publication_date>{publication_date}</โnews:publication_date>879 <news:title>Second story</โnews:title>880 </โnews:news>881 </โurl>882 883 <!-- The following story shouldn't get added as the XML ends prematurely -->884 <url>885 <loc>{base_url}/โnews/โthird.html</โloc>886 <news:news>887 <news:publication>888 <news:name>{publication_name}</โnews:name>889 <news:language>{publication_language}</โnews:language>890 </โnews:publication>891 <news:publicat892 """.format(893 base_url=self.TEST_BASE_URL,894 publication_name=self.TEST_PUBLICATION_NAME,895 publication_language=self.TEST_PUBLICATION_LANGUAGE,896 
publication_date=self.TEST_DATE_STR_ISO8601,897 )).strip(),898 )899 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)900 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)901 assert len(actual_sitemap_tree.sub_sitemaps) == 1902 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)903 # noinspection PyUnresolvedReferences904 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1905 # noinspection PyUnresolvedReferences906 sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]907 assert isinstance(sitemap, PagesXMLSitemap)908 assert len(sitemap.pages) == 2909 def test_sitemap_tree_for_homepage_no_sitemap(self):910 """Test sitemap_tree_for_homepage() with no sitemaps listed in robots.txt."""911 with requests_mock.Mocker() as m:912 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)913 m.get(914 self.TEST_BASE_URL + '/โ',915 text='This is a homepage.',916 )917 m.get(918 self.TEST_BASE_URL + '/โrobots.txt',919 headers={'Content-Type': 'text/โplain'},920 text=textwrap.dedent("""921 User-agent: *922 Disallow: /โwhatever923 """.format(base_url=self.TEST_BASE_URL)).strip(),924 )925 expected_sitemap_tree = IndexWebsiteSitemap(926 url='{}/โ'.format(self.TEST_BASE_URL),927 sub_sitemaps=[928 IndexRobotsTxtSitemap(929 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),930 sub_sitemaps=[],931 )932 ]933 )934 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)935 assert expected_sitemap_tree == actual_sitemap_tree936 def test_sitemap_tree_for_homepage_unpublished_sitemap(self):937 """Test sitemap_tree_for_homepage() with some sitemaps not published in robots.txt."""938 with requests_mock.Mocker() as m:939 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)940 m.get(941 self.TEST_BASE_URL + '/โ',942 text='This is a homepage.',943 )944 m.get(945 self.TEST_BASE_URL + '/โrobots.txt',946 headers={'Content-Type': 'text/โplain'},947 
text=textwrap.dedent("""948 User-agent: *949 Disallow: /โwhatever950 951 Sitemap: {base_url}/โsitemap_public.xml952 """.format(base_url=self.TEST_BASE_URL)).strip(),953 )954 # Public sitemap (linked to from robots.txt)955 m.get(956 self.TEST_BASE_URL + '/โsitemap_public.xml',957 text=textwrap.dedent("""958 <?xml version="1.0" encoding="UTF-8"?>959 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9">960 <url>961 <loc>{base_url}/โnews/โpublic.html</โloc>962 </โurl>963 </โurlset>964 """.format(965 base_url=self.TEST_BASE_URL,966 publication_name=self.TEST_PUBLICATION_NAME,967 publication_language=self.TEST_PUBLICATION_LANGUAGE,968 publication_date=self.TEST_DATE_STR_ISO8601,969 )).strip(),970 )971 # Private sitemap (to be discovered by trying out a few paths)972 m.get(973 self.TEST_BASE_URL + '/โsitemap_index.xml',974 text=textwrap.dedent("""975 <?xml version="1.0" encoding="UTF-8"?>976 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9">977 <url>978 <loc>{base_url}/โnews/โprivate.html</โloc>979 </โurl>980 </โurlset>981 """.format(982 base_url=self.TEST_BASE_URL,983 publication_name=self.TEST_PUBLICATION_NAME,984 publication_language=self.TEST_PUBLICATION_LANGUAGE,985 publication_date=self.TEST_DATE_STR_ISO8601,986 )).strip(),987 )988 expected_sitemap_tree = IndexWebsiteSitemap(989 url='{}/โ'.format(self.TEST_BASE_URL),990 sub_sitemaps=[991 IndexRobotsTxtSitemap(992 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),993 sub_sitemaps=[994 PagesXMLSitemap(995 url='{}/โsitemap_public.xml'.format(self.TEST_BASE_URL),996 pages=[997 SitemapPage(998 url='{}/โnews/โpublic.html'.format(self.TEST_BASE_URL),999 ),1000 ],1001 ),1002 ],1003 ),1004 PagesXMLSitemap(1005 url='{}/โsitemap_index.xml'.format(self.TEST_BASE_URL),1006 pages=[1007 SitemapPage(1008 url='{}/โnews/โprivate.html'.format(self.TEST_BASE_URL),1009 ),1010 ],1011 ),1012 ]1013 )1014 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1015 assert 
expected_sitemap_tree == actual_sitemap_tree1016 def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):1017 """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""1018 with requests_mock.Mocker() as m:1019 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1020 m.get(1021 self.TEST_BASE_URL + '/โ',1022 text='This is a homepage.',1023 )1024 m.get(1025 self.TEST_BASE_URL + '/โrobots.txt',1026 headers={'Content-Type': ''},1027 text=textwrap.dedent("""1028 User-agent: *1029 Disallow: /โwhatever1030 """.format(base_url=self.TEST_BASE_URL)).strip(),1031 )1032 expected_sitemap_tree = IndexWebsiteSitemap(1033 url='{}/โ'.format(self.TEST_BASE_URL),1034 sub_sitemaps=[1035 IndexRobotsTxtSitemap(1036 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),1037 sub_sitemaps=[],1038 )1039 ]1040 )1041 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1042 assert expected_sitemap_tree == actual_sitemap_tree1043 def test_sitemap_tree_for_homepage_no_robots_txt(self):1044 """Test sitemap_tree_for_homepage() with no robots.txt."""1045 with requests_mock.Mocker() as m:1046 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1047 m.get(1048 self.TEST_BASE_URL + '/โ',1049 text='This is a homepage.',1050 )1051 # Nonexistent robots.txt1052 m.get(1053 self.TEST_BASE_URL + '/โrobots.txt',1054 status_code=404,1055 reason='Not Found',1056 headers={'Content-Type': 'text/โhtml'},1057 text="<h1>404 Not Found!</โh1>",1058 )1059 expected_sitemap_tree = IndexWebsiteSitemap(1060 url='{}/โ'.format(self.TEST_BASE_URL),1061 sub_sitemaps=[1062 InvalidSitemap(1063 url='{}/โrobots.txt'.format(self.TEST_BASE_URL),1064 reason=(1065 'Unable to fetch sitemap from {base_url}/โrobots.txt: 404 Not Found'1066 ).format(base_url=self.TEST_BASE_URL),1067 )1068 ]1069 )1070 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1071 assert expected_sitemap_tree == actual_sitemap_tree1072 def 
test_sitemap_tree_for_homepage_huge_sitemap(self):1073 """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""1074 page_count = 10001075 sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>1076 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"1077 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9"1078 xmlns:xhtml="http:/โ/โwww.w3.org/โ1999/โxhtml">1079 """1080 for x in range(page_count):1081 sitemap_xml += """1082 <url>1083 <loc>{base_url}/โnews/โpage_{x}.html</โloc>1084 <!-- Element present but empty -->1085 <lastmod /โ>1086 <!-- Some other XML namespace -->1087 <xhtml:link rel="alternate"1088 media="only screen and (max-width: 640px)"1089 href="{base_url}/โnews/โpage_{x}.html?mobile=1" /โ>1090 <news:news>1091 <news:publication>1092 <news:name>{publication_name}</โnews:name>1093 <news:language>{publication_language}</โnews:language>1094 </โnews:publication>1095 <news:publication_date>{publication_date}</โnews:publication_date>1096 <news:title>Foo <foo></โnews:title> <!-- HTML entity decoding -->1097 </โnews:news>1098 </โurl>1099 """.format(1100 x=x,1101 base_url=self.TEST_BASE_URL,1102 publication_name=self.TEST_PUBLICATION_NAME,1103 publication_language=self.TEST_PUBLICATION_LANGUAGE,1104 publication_date=self.TEST_DATE_STR_ISO8601,1105 )1106 sitemap_xml += "</โurlset>"1107 with requests_mock.Mocker() as m:1108 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1109 m.get(1110 self.TEST_BASE_URL + '/โ',1111 text='This is a homepage.',1112 )1113 m.get(1114 self.TEST_BASE_URL + '/โrobots.txt',1115 headers={'Content-Type': 'text/โplain'},1116 text=textwrap.dedent("""1117 User-agent: *1118 Disallow: /โwhatever1119 1120 Sitemap: {base_url}/โsitemap.xml.gz1121 """.format(base_url=self.TEST_BASE_URL)).strip(),1122 )1123 m.get(1124 self.TEST_BASE_URL + '/โsitemap.xml.gz',1125 headers={'Content-Type': 'application/โx-gzip'},1126 content=gzip(sitemap_xml),1127 )1128 actual_sitemap_tree = 
sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1129 assert len(list(actual_sitemap_tree.all_pages())) == page_count1130 def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):1131 """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""1132 with requests_mock.Mocker() as m:1133 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1134 m.get(1135 self.TEST_BASE_URL + '/โ',1136 text='This is a homepage.',1137 )1138 robots_txt_body = ""1139 robots_txt_body += "User-agent: *\n"1140 # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL1141 robots_txt_body += " Sitemap:{base_url}/โsitemap.xml ".format(base_url=self.TEST_BASE_URL)1142 m.get(1143 self.TEST_BASE_URL + '/โrobots.txt',1144 headers={'Content-Type': 'text/โplain'},1145 text=robots_txt_body,1146 )1147 m.get(1148 self.TEST_BASE_URL + '/โsitemap.xml',1149 text=textwrap.dedent("""1150 <?xml version="1.0" encoding="UTF-8"?>1151 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"1152 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">1153 <url>1154 <loc>{base_url}/โnews/โfirst.html</โloc>1155 <news:news>1156 <news:publication>1157 <news:name>{publication_name}</โnews:name>1158 <news:language>{publication_language}</โnews:language>1159 </โnews:publication>1160 <news:publication_date>{publication_date}</โnews:publication_date>1161 <news:title>First story</โnews:title>1162 </โnews:news>1163 </โurl>1164 </โurlset>1165 """.format(1166 base_url=self.TEST_BASE_URL,1167 publication_name=self.TEST_PUBLICATION_NAME,1168 publication_language=self.TEST_PUBLICATION_LANGUAGE,1169 publication_date=self.TEST_DATE_STR_ISO8601,1170 )).strip(),1171 )1172 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1173 assert len(list(actual_sitemap_tree.all_pages())) == 11174 def test_sitemap_tree_for_homepage_utf8_bom(self):1175 """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and 
sitemap."""1176 robots_txt_body = textwrap.dedent("""1177 User-agent: *1178 Disallow: /โwhatever1179 Sitemap: {base_url}/โsitemap.xml1180 """.format(base_url=self.TEST_BASE_URL)).strip()1181 sitemap_xml_body = textwrap.dedent("""1182 <?xml version="1.0" encoding="UTF-8"?>1183 <urlset xmlns="http:/โ/โwww.sitemaps.org/โschemas/โsitemap/โ0.9"1184 xmlns:news="http:/โ/โwww.google.com/โschemas/โsitemap-news/โ0.9">1185 <url>1186 <loc>{base_url}/โnews/โfirst.html</โloc>1187 <news:news>1188 <news:publication>1189 <news:name>{publication_name}</โnews:name>1190 <news:language>{publication_language}</โnews:language>1191 </โnews:publication>1192 <news:publication_date>{publication_date}</โnews:publication_date>1193 <news:title>First story</โnews:title>1194 </โnews:news>1195 </โurl>1196 </โurlset>1197 """.format(1198 base_url=self.TEST_BASE_URL,1199 publication_name=self.TEST_PUBLICATION_NAME,1200 publication_language=self.TEST_PUBLICATION_LANGUAGE,1201 publication_date=self.TEST_DATE_STR_ISO8601,1202 )).strip()1203 robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig')1204 sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig')1205 with requests_mock.Mocker() as m:1206 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1207 m.get(1208 self.TEST_BASE_URL + '/โ',1209 text='This is a homepage.',1210 )1211 m.get(1212 self.TEST_BASE_URL + '/โrobots.txt',1213 headers={'Content-Type': 'text/โplain'},1214 content=robots_txt_body_encoded,1215 )1216 m.get(1217 self.TEST_BASE_URL + '/โsitemap.xml',1218 content=sitemap_xml_body_encoded,1219 )1220 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)...
combinations.py
Source: combinations.py
1'''Test cases arguments combinations.'''2import os3import sys4import inflection5TEST_DIR = os.path.abspath(os.path.dirname(__file__))6if TEST_DIR not in sys.path:7 sys.path.append(TEST_DIR)8from consts import TEMPDIR, TEST_BASE_URL # noqa: E4029from http_request_codegen.hrc_string import replace_multiple # noqa: E40210def argument_combination_to_filename(combination_name, index):11 return '{}.{}.expect.txt'.format(12 str(index).zfill(3),13 inflection.parameterize(14 replace_multiple(15 combination_name, replacements={16 '"': '-double-quote-',17 '\'': '-single-quote-',18 },19 ),20 ),21 )22def combination_arguments_to_kwargs(arguments):23 kwargs = {}24 for key, value in arguments.items():25 if key == 'kwargs':26 kwargs.update(value)27 else:28 kwargs[key] = value29 return kwargs30def get_argument_combinations(31 method='GET', include_filenames=True,32 dirpath=None,33):34 response = [35 {36 'name': 'URL',37 'arguments': {38 'url': TEST_BASE_URL,39 },40 },41 {42 'name': 'URL wrapping (no wrap)',43 'arguments': {44 'url': TEST_BASE_URL,45 'wrap': 99999,46 },47 },48 {49 'name': 'URL wrapping (wrap 15)',50 'arguments': {51 'url': TEST_BASE_URL,52 'wrap': 15,53 },54 },55 {56 'name': 'Parameter',57 'arguments': {58 'url': TEST_BASE_URL,59 'parameters': [60 {61 'name': 'param-1',62 'value': 'value-1',63 },64 ],65 },66 },67 {68 'name': 'Parameters',69 'arguments': {70 'url': TEST_BASE_URL,71 'parameters': [72 {73 'name': 'param-1',74 'value': 'foo',75 },76 {77 'name': 'param-2',78 'value': 1,79 },80 {81 'name': 'param-3',82 'value': .777,83 },84 {85 'name': 'param-4',86 'value': True,87 },88 ],89 },90 },91 {92 'name': 'Parameter wrapping value',93 'arguments': {94 'url': TEST_BASE_URL,95 'parameters': [96 {97 'name': 'param-1',98 'value': 'foo-bar-baz' * 50,99 },100 ],101 },102 },103 {104 'name': 'Parameters, one wrapping value',105 'arguments': {106 'url': TEST_BASE_URL,107 'parameters': [108 {109 'name': 'param-1',110 'value': 'foo-bar-baz' * 50,111 },112 {113 'name': 
'param-2',114 'value': 'value-2',115 },116 ],117 },118 },119 {120 'name': 'Parameter escaping quotes',121 'arguments': {122 'url': TEST_BASE_URL,123 'parameters': [124 {125 'name': 'param-1-with-\'\'-quotes',126 'value': 'value-1-with-\'\'-quotes',127 },128 ],129 },130 },131 {132 'name': 'URL + header',133 'arguments': {134 'url': TEST_BASE_URL,135 'headers': {136 'Content-Type': 'application/โjson',137 },138 },139 },140 {141 'name': 'URL + headers',142 'arguments': {143 'url': TEST_BASE_URL,144 'headers': {145 'Content-Type': 'application/โjson',146 'Accept-Language': 'es',147 },148 },149 },150 {151 'name': 'URL + header wrapping value',152 'arguments': {153 'url': TEST_BASE_URL,154 'headers': {155 'Content-Type': 'application/โjson' * 5,156 },157 },158 },159 {160 'name': 'URL + headers, one wrapping value',161 'arguments': {162 'url': TEST_BASE_URL,163 'headers': {164 'Content-Type': 'application/โjson' * 5,165 'Accept-Language': '*',166 },167 },168 },169 {170 'name': 'URL + header escaping quotes',171 'arguments': {172 'url': TEST_BASE_URL,173 'headers': {174 'Accept-Language': 'Header value with \'\' quotes',175 },176 },177 },178 {179 'name': 'URL + kwarg',180 'arguments': {181 'url': TEST_BASE_URL,182 'kwargs': {183 'timeout': 5,184 },185 },186 },187 {188 'name': 'URL + kwargs',189 'arguments': {190 'url': TEST_BASE_URL,191 'kwargs': {192 'timeout': 5,193 'stream': True,194 },195 },196 },197 {198 'name': 'URL + kwarg escaping quotes',199 'arguments': {200 'url': TEST_BASE_URL,201 'kwargs': {202 'cookies': {203 'foo': 'value with \'\' quotes',204 },205 },206 },207 },208 {209 'name': 'URL + kwarg wrapping value',210 'arguments': {211 'url': TEST_BASE_URL,212 'kwargs': {213 'cookies': {214 'bar': 'foo bar baz ' * 50,215 },216 },217 },218 },219 {220 'name': 'URL + kwargs, one wrapping value',221 'arguments': {222 'url': TEST_BASE_URL,223 'kwargs': {224 'cookies': {225 'bar': 'foo bar baz ' * 50,226 },227 'stream': True,228 },229 },230 },231 {232 'name': 'Parameter 
+ header',233 'arguments': {234 'url': TEST_BASE_URL,235 'parameters': [236 {237 'name': 'param-1',238 'value': 'value-1',239 },240 ],241 'headers': {242 'Content-Type': 'application/โjson',243 },244 },245 },246 {247 'name': 'Parameter + header (oneline)',248 'arguments': {249 'url': TEST_BASE_URL,250 'parameters': [251 {252 'name': 'param-1',253 'value': 'value-1',254 },255 ],256 'headers': {257 'Content-Type': 'application/โjson',258 },259 'oneline': True,260 },261 },262 {263 'name': 'Parameters + header',264 'arguments': {265 'url': TEST_BASE_URL,266 'parameters': [267 {268 'name': 'param-1',269 'value': 'value-1',270 },271 {272 'name': 'param-2',273 'value': 'value-2',274 },275 ],276 'headers': {277 'Content-Type': 'application/โjson',278 },279 },280 },281 {282 'name': 'Parameter + headers',283 'arguments': {284 'url': TEST_BASE_URL,285 'parameters': [286 {287 'name': 'param-1',288 'value': 'value-1',289 },290 ],291 'headers': {292 'Content-Type': 'application/โjson',293 'Accept-Language': '*',294 },295 },296 },297 {298 'name': 'Parameters + headers',299 'arguments': {300 'url': TEST_BASE_URL,301 'parameters': [302 {303 'name': 'param-1',304 'value': 'value-1',305 },306 {307 'name': 'param-2',308 'value': 'value-2',309 },310 ],311 'headers': {312 'Content-Type': 'application/โjson',313 'Accept-Language': '*',314 },315 },316 },317 {318 'name': 'Parameter + kwarg',319 'arguments': {320 'url': TEST_BASE_URL,321 'parameters': [322 {323 'name': 'param-1',324 'value': 'value-1',325 },326 ],327 'kwargs': {328 'timeout': 10,329 },330 },331 },332 {333 'name': 'Parameter + kwarg (oneline)',334 'arguments': {335 'url': TEST_BASE_URL,336 'parameters': [337 {338 'name': 'a',339 'value': 'b',340 },341 ],342 'kwargs': {343 'timeout': 10,344 },345 'oneline': True,346 },347 },348 {349 'name': 'Parameters + kwarg',350 'arguments': {351 'url': TEST_BASE_URL,352 'parameters': [353 {354 'name': 'param-1',355 'value': 'value-1',356 },357 {358 'name': 'param-2',359 'value': 
'value-2',360 },361 ],362 'kwargs': {363 'timeout': 10,364 },365 },366 },367 {368 'name': 'Parameter + kwargs',369 'arguments': {370 'url': TEST_BASE_URL,371 'parameters': [372 {373 'name': 'param-1',374 'value': 'value-1',375 },376 ],377 'kwargs': {378 'timeout': 10,379 'stream': True,380 },381 },382 },383 {384 'name': 'Parameters + kwargs',385 'arguments': {386 'url': TEST_BASE_URL,387 'parameters': [388 {389 'name': 'param-1',390 'value': 'value-1',391 },392 {393 'name': 'param-2',394 'value': 'value-2',395 },396 ],397 'kwargs': {398 'timeout': 10,399 'stream': True,400 },401 },402 },403 {404 'name': 'URL + header + kwarg',405 'arguments': {406 'url': TEST_BASE_URL,407 'headers': {408 'Content-Type': 'application/โjson',409 },410 'kwargs': {411 'timeout': 5,412 },413 },414 },415 {416 'name': 'URL + header + kwarg (oneline)',417 'arguments': {418 'url': TEST_BASE_URL,419 'headers': {420 'Content-Type': 'application/โjson',421 },422 'kwargs': {423 'timeout': 5,424 },425 'oneline': True,426 },427 },428 {429 'name': 'URL + headers + kwarg',430 'arguments': {431 'url': TEST_BASE_URL,432 'headers': {433 'Content-Type': 'application/โjson',434 'Accept-Language': '*',435 },436 'kwargs': {437 'timeout': 5,438 },439 },440 },441 {442 'name': 'URL + header + kwargs',443 'arguments': {444 'url': TEST_BASE_URL,445 'headers': {446 'Accept-Language': '*',447 },448 'kwargs': {449 'timeout': 5,450 'stream': False,451 },452 },453 },454 {455 'name': 'URL + headers + kwargs',456 'arguments': {457 'url': TEST_BASE_URL,458 'headers': {459 'Content-Type': 'application/โjson',460 'Accept-Language': '*',461 },462 'kwargs': {463 'timeout': 5,464 'stream': False,465 },466 },467 },468 {469 'name': 'Parameter + header + kwarg',470 'arguments': {471 'url': TEST_BASE_URL,472 'parameters': [473 {474 'name': 'param-1',475 'value': 'value-1',476 },477 ],478 'headers': {479 'Content-Type': 'application/โjson',480 },481 'kwargs': {482 'timeout': 5,483 },484 },485 },486 {487 'name': 'Parameter + 
header + kwargs',488 'arguments': {489 'url': TEST_BASE_URL,490 'parameters': [491 {492 'name': 'param-1',493 'value': 'value-1',494 },495 ],496 'headers': {497 'Content-Type': 'application/โjson',498 },499 'kwargs': {500 'timeout': 5,501 'stream': True,502 },503 },504 },505 {506 'name': 'Parameters + header + kwarg',507 'arguments': {508 'url': TEST_BASE_URL,509 'parameters': [510 {511 'name': 'param-1',512 'value': 'value-1',513 },514 {515 'name': 'param-2',516 'value': 7.77,517 },518 ],519 'headers': {520 'Content-Type': 'application/โjson',521 },522 'kwargs': {523 'timeout': 5,524 },525 },526 },527 {528 'name': 'Parameters + header + kwargs',529 'arguments': {530 'url': TEST_BASE_URL,531 'parameters': [532 {533 'name': 'param-1',534 'value': 'value-1',535 },536 {537 'name': 'param-2',538 'value': 7.77,539 },540 ],541 'headers': {542 'Content-Type': 'application/โjson',543 },544 'kwargs': {545 'timeout': 5,546 'stream': False,547 },548 },549 },550 {551 'name': 'Parameters + headers + kwarg',552 'arguments': {553 'url': TEST_BASE_URL,554 'parameters': [555 {556 'name': 'param-1',557 'value': 'value-1',558 },559 {560 'name': 'param-2',561 'value': 7.77,562 },563 ],564 'headers': {565 'Content-Type': 'application/โjson',566 'Accept-Language': 'fr',567 },568 'kwargs': {569 'timeout': 5,570 },571 },572 },573 {574 'name': 'Parameters + headers + kwargs',575 'arguments': {576 'url': TEST_BASE_URL,577 'parameters': [578 {579 'name': 'param-1',580 'value': 'value-1',581 },582 {583 'name': 'param-2',584 'value': 7.77,585 },586 ],587 'headers': {588 'Content-Type': 'application/โjson',589 'Accept-Language': 'fr',590 },591 'kwargs': {592 'timeout': 5,593 'stream': True,594 },595 },596 },597 {598 'name': 'Setup',599 'arguments': {600 'url': TEST_BASE_URL,601 'setup': True,602 },603 },604 {605 'name': 'No setup',606 'arguments': {607 'url': TEST_BASE_URL,608 'setup': False,609 },610 },611 {612 'name': 'Custom setup',613 'arguments': {614 'url': TEST_BASE_URL,615 'setup': 
'custom_setup=1\n\n',616 },617 },618 {619 'name': 'Custom teardown',620 'arguments': {621 'url': TEST_BASE_URL,622 'teardown': '\n\ncustom_teardown=1',623 },624 },625 {626 'name': 'Quote character \'',627 'arguments': {628 'url': TEST_BASE_URL,629 'quote_char': '\'',630 },631 },632 {633 'name': 'Quote character "',634 'arguments': {635 'url': TEST_BASE_URL,636 'quote_char': '"',637 },638 },639 {640 'name': 'Indent 2 spaces',641 'arguments': {642 'url': TEST_BASE_URL,643 'indent': ' ',644 'headers': {645 'Accept-Language': 'es en fr * ' * 20,646 },647 },648 },649 {650 'name': 'Indent 4 spaces',651 'arguments': {652 'url': TEST_BASE_URL,653 'indent': ' ',654 'headers': {655 'Accept-Language': 'es en fr * ' * 20,656 },657 },658 },659 {660 'name': 'One line',661 'arguments': {662 'url': TEST_BASE_URL,663 'oneline': True,664 },665 },666 {667 'name': 'One line + no setup',668 'arguments': {669 'url': TEST_BASE_URL,670 'oneline': True,671 'setup': False,672 },673 },674 {675 'name': 'Wrap 0',676 'arguments': {677 'url': TEST_BASE_URL,678 'wrap': 0,679 },680 },681 {682 'name': 'Wrap 1',683 'arguments': {684 'url': TEST_BASE_URL,685 'wrap': 1,686 },687 },688 {689 'name': 'Wrap 10',690 'arguments': {691 'url': TEST_BASE_URL,692 'wrap': 10,693 },694 },695 {696 'name': 'Wrap 20',697 'arguments': {698 'url': TEST_BASE_URL,699 'wrap': 20,700 },701 },702 {703 'name': 'Wrap 25',704 'arguments': {705 'url': TEST_BASE_URL,706 'wrap': 25,707 },708 },709 {710 'name': 'Wrap 30',711 'arguments': {712 'url': TEST_BASE_URL,713 'wrap': 30,714 },715 },716 {717 'name': 'Wrap 35',718 'arguments': {719 'url': TEST_BASE_URL,720 'wrap': 35,721 },722 },723 {724 'name': 'Wrap 40',725 'arguments': {726 'url': TEST_BASE_URL,727 'wrap': 40,728 },729 },730 {731 'name': 'Wrap infinite',732 'arguments': {733 'url': TEST_BASE_URL,734 'wrap': float('inf'),735 },736 },737 {738 'name': 'Wrap null is infinite',739 'arguments': {740 'url': TEST_BASE_URL,741 'wrap': None,742 },743 },744 ]745 if method.lower() 
== 'post':746 response.extend([747 {748 'name': 'Data by parameter (text/โplain)',749 'arguments': {750 'url': TEST_BASE_URL,751 'parameters': [752 {753 'name': '',754 'value': 'foo bar baz ' * 3,755 },756 ],757 'headers': {758 'Content-Type': 'text/โplain',759 },760 },761 },762 {763 'name': 'Data by parameter (text/โplain) wrapping value',764 'arguments': {765 'url': TEST_BASE_URL,766 'parameters': [767 {768 'name': '',769 'value': 'foo bar baz ' * 30,770 },771 ],772 'headers': {773 'Content-Type': 'text/โplain',774 },775 },776 },777 {778 'name': 'Data by parameter (application/โjson)',779 'arguments': {780 'url': TEST_BASE_URL,781 'parameters': [782 {783 'name': 'param-1',784 'value': 'value-1',785 },786 ],787 'headers': {788 'Content-Type': 'application/โjson',789 },790 },791 },792 {793 'name': 'Data by parameters (application/โjson)',794 'arguments': {795 'url': TEST_BASE_URL,796 'parameters': [797 {798 'name': 'param-int',799 'value': 1,800 },801 {802 'name': 'param-float',803 'value': .777,804 },805 {806 'name': 'param-bool',807 'value': True,808 },809 ],810 'headers': {811 'Content-Type': 'application/โjson',812 },813 },814 },815 {816 'name': (817 'Data by parameter'818 ' (application/โx-www-form-urlencoded)'819 ),820 'arguments': {821 'url': TEST_BASE_URL,822 'parameters': [823 {824 'name': 'param-1',825 'value': 'value-1',826 },827 ],828 'headers': {829 'Content-Type': 'application/โx-www-form-urlencoded',830 },831 },832 },833 {834 'name': (835 'Data by parameters'836 ' (application/โx-www-form-urlencoded)'837 ),838 'arguments': {839 'url': TEST_BASE_URL,840 'parameters': [841 {842 'name': 'param-int',843 'value': 1,844 },845 {846 'name': 'param-float',847 'value': .777,848 },849 {850 'name': 'param-bool',851 'value': True,852 },853 ],854 'headers': {855 'Content-Type': 'application/โx-www-form-urlencoded',856 },857 },858 },859 {860 'name': 'File by filepath (multipart/โform-data)',861 'arguments': {862 'url': TEST_BASE_URL,863 'files': {864 'param-1': 
os.path.join(TEMPDIR, 'file-1.ext'),865 },866 },867 },868 {869 'name': 'Files by filepath (multipart/โform-data)',870 'arguments': {871 'url': TEST_BASE_URL,872 'files': {873 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),874 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),875 },876 },877 },878 {879 'name': 'File by filepath (multipart/โform-data) wrapping',880 'arguments': {881 'url': TEST_BASE_URL,882 'files': {883 'param-1': os.path.join(884 TEMPDIR, '%s.ext' % ('foo' * 40),885 ),886 },887 },888 },889 {890 'name': (891 'Files by filepath (multipart/โform-data)'892 ' with Content-Type'893 ),894 'arguments': {895 'url': TEST_BASE_URL,896 'files': {897 'param-1': (898 os.path.join(TEMPDIR, 'file-1.ext'),899 'text/โplain',900 ),901 'param-2': (902 os.path.join(TEMPDIR, 'file-2.ext'),903 'text/โcsv',904 ),905 },906 },907 },908 {909 'name': (910 'File by filepath (multipart/โform-data)'911 ' with Content-Type wrapping'912 ),913 'arguments': {914 'url': TEST_BASE_URL,915 'files': {916 'param-1': (917 os.path.join(TEMPDIR, 'file-1.ext'),918 'text/โplain ' * 20,919 ),920 },921 },922 },923 {924 'name': (925 'File by filepath (multipart/โform-data),'926 ' Content-Type, header'927 ),928 'arguments': {929 'url': TEST_BASE_URL,930 'files': {931 'param-1': (932 os.path.join(TEMPDIR, 'file-1.ext'),933 'text/โplain',934 {'Accept-Language': 'es'},935 ),936 },937 },938 },939 {940 'name': (941 'File by filepath (multipart/โform-data),'942 ' Content-Type, headers'943 ),944 'arguments': {945 'url': TEST_BASE_URL,946 'files': {947 'param-1': (948 os.path.join(TEMPDIR, 'file-1.ext'),949 'text/โplain',950 {951 'Accept-Language': 'es',952 'Accept-Charset': 'utf-8',953 },954 ),955 },956 },957 },958 {959 'name': 'Files by filepath (multipart/โform-data) + parameter',960 'arguments': {961 'url': TEST_BASE_URL,962 'files': {963 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),964 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),965 },966 'parameters': [967 {968 'name': 'param-1',969 'value': 
'value-1',970 },971 ],972 },973 },974 {975 'name': 'Files by filepath (multipart/โform-data) + parameters',976 'arguments': {977 'url': TEST_BASE_URL,978 'files': {979 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),980 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),981 },982 'parameters': [983 {984 'name': 'param-1',985 'value': 'value-1',986 },987 {988 'name': 'param-2',989 'value': 'value-2',990 },991 ],992 },993 },994 {995 'name': (996 'Files by filepath (multipart/โform-data) + parameter'997 ' + header'998 ),999 'arguments': {1000 'url': TEST_BASE_URL,1001 'files': {1002 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1003 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1004 },1005 'parameters': [1006 {1007 'name': 'param-1',1008 'value': 'value-1',1009 },1010 ],1011 'headers': {1012 'Accept-Language': 'fr',1013 },1014 },1015 },1016 {1017 'name': (1018 'Files by filepath (multipart/โform-data) + parameter'1019 ' + headers'1020 ),1021 'arguments': {1022 'url': TEST_BASE_URL,1023 'files': {1024 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1025 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1026 },1027 'parameters': [1028 {1029 'name': 'param-1',1030 'value': 'value-1',1031 },1032 ],1033 'headers': {1034 'Accept-Language': 'fr',1035 'Accept-Charset': 'utf-8',1036 },1037 },1038 },1039 {1040 'name': (1041 'Files by filepath (multipart/โform-data) + parameters'1042 ' + header'1043 ),1044 'arguments': {1045 'url': TEST_BASE_URL,1046 'files': {1047 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1048 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1049 },1050 'parameters': [1051 {1052 'name': 'param-1',1053 'value': 'value-1',1054 },1055 {1056 'name': 'param-2',1057 'value': 'value-2',1058 },1059 ],1060 'headers': {1061 'Accept-Language': 'es',1062 },1063 },1064 },1065 {1066 'name': (1067 'Files by filepath (multipart/โform-data) + parameters'1068 ' + headers'1069 ),1070 'arguments': {1071 'url': TEST_BASE_URL,1072 'files': {1073 'param-1': os.path.join(TEMPDIR, 
'file-1.ext'),1074 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1075 },1076 'parameters': [1077 {1078 'name': 'param-1',1079 'value': 'value-1',1080 },1081 {1082 'name': 'param-2',1083 'value': 'value-2',1084 },1085 ],1086 'headers': {1087 'Accept-Language': 'fr',1088 'Accept-Charset': 'utf-8',1089 },1090 },1091 },1092 {1093 'name': (1094 'Files by filepath (multipart/โform-data) + parameter'1095 ' + header + kwarg'1096 ),1097 'arguments': {1098 'url': TEST_BASE_URL,1099 'files': {1100 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1101 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1102 },1103 'parameters': [1104 {1105 'name': 'param-1',1106 'value': 'value-1',1107 },1108 ],1109 'headers': {1110 'Accept-Language': 'fr',1111 },1112 'kwargs': {1113 'timeout': 10,1114 },1115 },1116 },1117 {1118 'name': (1119 'Files by filepath (multipart/โform-data) + parameter'1120 ' + headers + kwarg'1121 ),1122 'arguments': {1123 'url': TEST_BASE_URL,1124 'files': {1125 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1126 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1127 },1128 'parameters': [1129 {1130 'name': 'param-1',1131 'value': 'value-1',1132 },1133 ],1134 'headers': {1135 'Accept-Language': 'fr',1136 'Accept-Charset': 'utf-8',1137 },1138 'kwargs': {1139 'timeout': 10,1140 },1141 },1142 },1143 {1144 'name': (1145 'Files by filepath (multipart/โform-data) + parameters'1146 ' + header + kwarg'1147 ),1148 'arguments': {1149 'url': TEST_BASE_URL,1150 'files': {1151 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1152 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1153 },1154 'parameters': [1155 {1156 'name': 'param-1',1157 'value': 'value-1',1158 },1159 {1160 'name': 'param-2',1161 'value': 'value-2',1162 },1163 ],1164 'headers': {1165 'Accept-Language': 'fr',1166 },1167 'kwargs': {1168 'timeout': 10,1169 },1170 },1171 },1172 {1173 'name': (1174 'Files by filepath (multipart/โform-data) + parameters'1175 ' + headers + kwarg'1176 ),1177 'arguments': {1178 'url': 
TEST_BASE_URL,1179 'files': {1180 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1181 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1182 },1183 'parameters': [1184 {1185 'name': 'param-1',1186 'value': 'value-1',1187 },1188 {1189 'name': 'param-2',1190 'value': 'value-2',1191 },1192 ],1193 'headers': {1194 'Accept-Language': 'fr',1195 'Accept-Charset': 'utf-8',1196 },1197 'kwargs': {1198 'timeout': 10,1199 },1200 },1201 },1202 {1203 'name': (1204 'Files by filepath (multipart/โform-data) + parameter'1205 ' + header + kwargs'1206 ),1207 'arguments': {1208 'url': TEST_BASE_URL,1209 'files': {1210 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1211 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1212 },1213 'parameters': [1214 {1215 'name': 'param-1',1216 'value': 'value-1',1217 },1218 ],1219 'headers': {1220 'Accept-Language': 'fr',1221 },1222 'kwargs': {1223 'timeout': 10,1224 'cookies': {1225 'hello': 'world',1226 },1227 },1228 },1229 },1230 {1231 'name': (1232 'Files by filepath (multipart/โform-data) + parameter'1233 ' + headers + kwargs'1234 ),1235 'arguments': {1236 'url': TEST_BASE_URL,1237 'files': {1238 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1239 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1240 },1241 'parameters': [1242 {1243 'name': 'param-1',1244 'value': 'value-1',1245 },1246 ],1247 'headers': {1248 'Accept-Language': 'fr',1249 'Accept-Charset': 'utf-8',1250 },1251 'kwargs': {1252 'timeout': 10,1253 'stream': False,1254 },1255 },1256 },1257 {1258 'name': (1259 'Files by filepath (multipart/โform-data) + parameters'1260 ' + header + kwargs'1261 ),1262 'arguments': {1263 'url': TEST_BASE_URL,1264 'files': {1265 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1266 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1267 },1268 'parameters': [1269 {1270 'name': 'param-1',1271 'value': 'value-1',1272 },1273 {1274 'name': 'param-2',1275 'value': 'value-2',1276 },1277 ],1278 'headers': {1279 'Accept-Language': 'fr',1280 },1281 'kwargs': {1282 'timeout': 
10,1283 'stream': False,1284 },1285 },1286 },1287 {1288 'name': (1289 'Files by filepath (multipart/โform-data) + parameters'1290 ' + headers + kwargs'1291 ),1292 'arguments': {1293 'url': TEST_BASE_URL,1294 'files': {1295 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1296 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1297 },1298 'parameters': [1299 {1300 'name': 'param-1',1301 'value': 'value-1',1302 },1303 {1304 'name': 'param-2',1305 'value': 'value-2',1306 },1307 ],1308 'headers': {1309 'Accept-Language': 'fr',1310 'Accept-Charset': 'utf-8',1311 },1312 'kwargs': {1313 'timeout': 10,1314 'stream': False,1315 },1316 },1317 },1318 {1319 'name': (1320 'No setup + files by filepath (multipart/โform-data)'1321 ' + parameters + headers + kwargs + '1322 ),1323 'arguments': {1324 'url': TEST_BASE_URL,1325 'setup': False,1326 'files': {1327 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1328 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1329 },1330 'parameters': [1331 {1332 'name': 'param-1',1333 'value': 'value-1',1334 },1335 {1336 'name': 'param-2',1337 'value': 'value-2',1338 },1339 ],1340 'headers': {1341 'Accept-Language': 'fr',1342 'Accept-Charset': 'utf-8',1343 },1344 'kwargs': {1345 'timeout': 10,1346 'stream': False,1347 },1348 },1349 },1350 ])1351 if include_filenames:1352 for index, args_group in enumerate(response):1353 fname = argument_combination_to_filename(1354 args_group['name'], index,1355 )1356 if dirpath and os.path.exists(dirpath):1357 fname = os.path.join(dirpath, fname)1358 args_group['filename'] = fname...
test_logoscraper.py
Source: test_logoscraper.py
1from bs4 import BeautifulSoup2import scraper.logoscraper as logoscraper3test_base_url = "https:/โ/โwww.testbase.com"4def test_get_logo_should_pass():5 htmls = [6 f"<div class='logo'><img src='https:/โ/โwww.test.com'></โimg></โdiv>",7 f"<div id='logo'><img src='https:/โ/โwww.test.com'></โimg></โdiv>",8 f"<a><img src='https:/โ/โwww.test.com'></โimg></โa>",9 f"<div><img src='https:/โ/โwww.test.com'></โimg></โdiv>",10 f"<a href={test_base_url}><img src='https:/โ/โwww.test.com'></โimg></โdiv>",11 ]12 for html in htmls:13 assert (14 logoscraper.get_logo(BeautifulSoup(html, "html.parser"), test_base_url)15 == "https:/โ/โwww.test.com"16 )17def test_find_image_tag():18 result = logoscraper.find_img_tag(19 BeautifulSoup(20 f'<a><img src="https:/โ/โ{test_base_url}"></โimg></โa>', "html.parser"21 ),22 test_base_url,23 )24 assert result25def test_find_image_tag_return_itself():26 result = logoscraper.find_img_tag(27 BeautifulSoup(f'<img src="https:/โ/โ{test_base_url}"></โimg>', "html.parser"),28 test_base_url,29 )30 assert result31def test_find_image_tag_retrun_none():32 result = logoscraper.find_img_tag(33 BeautifulSoup(f'<a href="https:/โ/โ{test_base_url}"></โa>', "html.parser"),34 test_base_url,35 )36 assert result is None37def test_format_image_source():38 html = BeautifulSoup(f"<img src='{test_base_url}'></โimg>", "html.parser").find(39 "img"40 )41 assert logoscraper.format_image_source(html, test_base_url) == test_base_url42def test_format_image_source_no_source():43 html = BeautifulSoup(f"<img></โimg>", "html.parser").find("img")44 assert logoscraper.format_image_source(html, test_base_url) is None45def test_format_image_source_relative_path():46 html = BeautifulSoup(47 f"<img src='resources/โimages/โimage.png'></โimg>", "html.parser"48 ).find("img")49 assert (50 logoscraper.format_image_source(html, test_base_url)51 == f"{test_base_url}/โresources/โimages/โimage.png"...
Check out the latest blogs from LambdaTest on this topic:
There are times when developers get stuck with a problem that has to do with version changes. Trying to run the code or test without upgrading the package can result in unexpected errors.
Automating testing is a crucial step in the development pipeline of a software product. In an agile development environment, where there is continuous development, deployment, and maintenance of software products, automation testing ensures that the end software products delivered are error-free.
Web applications continue to evolve at an unbelievable pace, and the architecture surrounding web apps gets more complicated all the time. With the growth in complexity of web applications and the development process, web application testing also needs to keep pace with ever-changing demands.
“Test frequently and early.” If you’ve been following my testing agenda, you’re probably sick of hearing me repeat that. However, it makes sense: if your tests detect an issue soon after it occurs, it will be easier to resolve. This is one of the guiding concepts that makes continuous integration such an effective method. I’ve encountered several teams who have a lot of automated tests but don’t use them as part of a continuous integration approach. There are frequently various reasons why the team believes these tests cannot be used with continuous integration. Perhaps the tests take too long to run, or they are not dependable enough to provide correct results on their own, necessitating human interpretation.
Hey LambdaTesters! We’ve got something special for you this week.
Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!