Best Python code snippet using selene_python
test_tree.py
Source:test_tree.py
1import datetime2import difflib3import textwrap4from decimal import Decimal5from email.utils import format_datetime6from unittest import TestCase7import requests_mock8from dateutil.tz import tzoffset9from tests.helpers import gzip10from usp.log import create_logger11from usp.objects.page import (12 SitemapPage,13 SitemapNewsStory,14 SitemapPageChangeFrequency,15)16from usp.objects.sitemap import (17 IndexRobotsTxtSitemap,18 PagesXMLSitemap,19 IndexXMLSitemap,20 InvalidSitemap,21 PagesTextSitemap,22 IndexWebsiteSitemap,23 PagesRSSSitemap,24 PagesAtomSitemap,25)26from usp.tree import sitemap_tree_for_homepage27# FIXME various exotic properties28# FIXME XML vulnerabilities with Expat29# FIXME max. recursion level30# FIXME tests responses that are too big31log = create_logger(__name__)32class TestSitemapTree(TestCase):33 TEST_BASE_URL = 'http://test_ultimate-sitemap-parser.com' # mocked by HTTPretty34 # Publication / "last modified" date35 TEST_DATE_DATETIME = datetime.datetime(36 year=2009, month=12, day=17, hour=12, minute=4, second=56,37 tzinfo=tzoffset(None, 7200),38 )39 TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat()40 """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps)."""41 TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME)42 """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps)."""43 TEST_PUBLICATION_NAME = 'Test publication'44 TEST_PUBLICATION_LANGUAGE = 'en'45 @staticmethod46 def fallback_to_404_not_found_matcher(request):47 """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress."""48 return requests_mock.create_response(49 request,50 status_code=404,51 reason='Not Found',52 headers={'Content-Type': 'text/html'},53 text="<h1>404 Not Found!</h1>",54 )55 # noinspection DuplicatedCode56 def test_sitemap_tree_for_homepage(self):57 """Test sitemap_tree_for_homepage()."""58 with requests_mock.Mocker() as m:59 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)60 m.get(61 
self.TEST_BASE_URL + '/',62 text='This is a homepage.',63 )64 m.get(65 self.TEST_BASE_URL + '/robots.txt',66 headers={'Content-Type': 'text/plain'},67 text=textwrap.dedent("""68 User-agent: *69 Disallow: /whatever70 71 Sitemap: {base_url}/sitemap_pages.xml72 73 # Intentionally spelled as "Site-map" as Google tolerates this:74 # https://github.com/google/robotstxt/blob/master/robots.cc#L703 75 Site-map: {base_url}/sitemap_news_index_1.xml76 """.format(base_url=self.TEST_BASE_URL)).strip(),77 )78 # One sitemap for random static pages79 m.get(80 self.TEST_BASE_URL + '/sitemap_pages.xml',81 headers={'Content-Type': 'application/xml'},82 text=textwrap.dedent("""83 <?xml version="1.0" encoding="UTF-8"?>84 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">85 <url>86 <loc>{base_url}/about.html</loc>87 <lastmod>{last_modified_date}</lastmod>88 <changefreq>monthly</changefreq>89 <priority>0.8</priority>90 </url>91 <url>92 <loc>{base_url}/contact.html</loc>93 <lastmod>{last_modified_date}</lastmod>94 95 <!-- Invalid change frequency -->96 <changefreq>when we feel like it</changefreq>97 98 <!-- Invalid priority -->99 <priority>1.1</priority>100 101 </url>102 </urlset>103 """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(),104 )105 # Index sitemap pointing to sitemaps with stories106 m.get(107 self.TEST_BASE_URL + '/sitemap_news_index_1.xml',108 headers={'Content-Type': 'application/xml'},109 text=textwrap.dedent("""110 <?xml version="1.0" encoding="UTF-8"?>111 <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">112 <sitemap>113 <loc>{base_url}/sitemap_news_1.xml</loc>114 <lastmod>{last_modified}</lastmod>115 </sitemap>116 <sitemap>117 <loc>{base_url}/sitemap_news_index_2.xml</loc>118 <lastmod>{last_modified}</lastmod>119 </sitemap>120 </sitemapindex>121 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),122 )123 # First sitemap with actual stories124 m.get(125 
self.TEST_BASE_URL + '/sitemap_news_1.xml',126 headers={'Content-Type': 'application/xml'},127 text=textwrap.dedent("""128 <?xml version="1.0" encoding="UTF-8"?>129 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"130 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"131 xmlns:xhtml="http://www.w3.org/1999/xhtml">132 133 <url>134 <loc>{base_url}/news/foo.html</loc>135 136 <!-- Element present but empty -->137 <lastmod />138 139 <!-- Some other XML namespace -->140 <xhtml:link rel="alternate"141 media="only screen and (max-width: 640px)"142 href="{base_url}/news/foo.html?mobile=1" />143 144 <news:news>145 <news:publication>146 <news:name>{publication_name}</news:name>147 <news:language>{publication_language}</news:language>148 </news:publication>149 <news:publication_date>{publication_date}</news:publication_date>150 <news:title>Foo <foo></news:title> <!-- HTML entity decoding -->151 </news:news>152 </url>153 154 <!-- Has a duplicate story in /sitemap_news_2.xml -->155 <url>156 <loc>{base_url}/news/bar.html</loc>157 <xhtml:link rel="alternate"158 media="only screen and (max-width: 640px)"159 href="{base_url}/news/bar.html?mobile=1" />160 <news:news>161 <news:publication>162 <news:name>{publication_name}</news:name>163 <news:language>{publication_language}</news:language>164 </news:publication>165 <news:publication_date>{publication_date}</news:publication_date>166 <news:title>Bar & bar</news:title>167 </news:news>168 </url>169 170 </urlset>171 """.format(172 base_url=self.TEST_BASE_URL,173 publication_name=self.TEST_PUBLICATION_NAME,174 publication_language=self.TEST_PUBLICATION_LANGUAGE,175 publication_date=self.TEST_DATE_STR_ISO8601,176 )).strip(),177 )178 # Another index sitemap pointing to a second sitemaps with stories179 m.get(180 self.TEST_BASE_URL + '/sitemap_news_index_2.xml',181 headers={'Content-Type': 'application/xml'},182 text=textwrap.dedent("""183 <?xml version="1.0" encoding="UTF-8"?>184 <sitemapindex 
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">185 186 <sitemap>187 <!-- Extra whitespace added around URL -->188 <loc> {base_url}/sitemap_news_2.xml </loc>189 <lastmod>{last_modified}</lastmod>190 </sitemap>191 192 <!-- Nonexistent sitemap -->193 <sitemap>194 <loc>{base_url}/sitemap_news_missing.xml</loc>195 <lastmod>{last_modified}</lastmod>196 </sitemap>197 198 </sitemapindex>199 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),200 )201 # Second sitemap with actual stories202 m.get(203 self.TEST_BASE_URL + '/sitemap_news_2.xml',204 headers={'Content-Type': 'application/xml'},205 text=textwrap.dedent("""206 <?xml version="1.0" encoding="UTF-8"?>207 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"208 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"209 xmlns:xhtml="http://www.w3.org/1999/xhtml">210 211 <!-- Has a duplicate story in /sitemap_news_1.xml -->212 <url>213 <!-- Extra whitespace added around URL -->214 <loc> {base_url}/news/bar.html </loc>215 <xhtml:link rel="alternate"216 media="only screen and (max-width: 640px)"217 href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" />218 <news:news>219 <news:publication>220 <news:name>{publication_name}</news:name>221 <news:language>{publication_language}</news:language>222 </news:publication>223 <news:publication_date>{publication_date}</news:publication_date>224 225 <tag_without_inner_character_data name="value" />226 227 <news:title>Bar & bar</news:title>228 </news:news>229 </url>230 231 <url>232 <loc>{base_url}/news/baz.html</loc>233 <xhtml:link rel="alternate"234 media="only screen and (max-width: 640px)"235 href="{base_url}/news/baz.html?mobile=1" />236 <news:news>237 <news:publication>238 <news:name>{publication_name}</news:name>239 <news:language>{publication_language}</news:language>240 </news:publication>241 <news:publication_date>{publication_date}</news:publication_date>242 <news:title><![CDATA[BÄ
ž]]></news:title> <!-- CDATA and UTF-8 -->243 </news:news>244 </url>245 246 </urlset>247 """.format(248 base_url=self.TEST_BASE_URL,249 publication_name=self.TEST_PUBLICATION_NAME,250 publication_language=self.TEST_PUBLICATION_LANGUAGE,251 publication_date=self.TEST_DATE_STR_ISO8601,252 )).strip(),253 )254 # Nonexistent sitemap255 m.get(256 self.TEST_BASE_URL + '/sitemap_news_missing.xml',257 status_code=404,258 reason='Not Found',259 headers={'Content-Type': 'text/html'},260 text="<h1>404 Not Found!</h1>",261 )262 expected_sitemap_tree = IndexWebsiteSitemap(263 url='{}/'.format(self.TEST_BASE_URL),264 sub_sitemaps=[265 IndexRobotsTxtSitemap(266 url='{}/robots.txt'.format(self.TEST_BASE_URL),267 sub_sitemaps=[268 PagesXMLSitemap(269 url='{}/sitemap_pages.xml'.format(self.TEST_BASE_URL),270 pages=[271 SitemapPage(272 url='{}/about.html'.format(self.TEST_BASE_URL),273 last_modified=self.TEST_DATE_DATETIME,274 news_story=None,275 change_frequency=SitemapPageChangeFrequency.MONTHLY,276 priority=Decimal('0.8'),277 ),278 SitemapPage(279 url='{}/contact.html'.format(self.TEST_BASE_URL),280 last_modified=self.TEST_DATE_DATETIME,281 news_story=None,282 # Invalid input -- should be reset to "always"283 change_frequency=SitemapPageChangeFrequency.ALWAYS,284 # Invalid input -- should be reset to 0.5 (the default as per the spec)285 priority=Decimal('0.5'),286 )287 ],288 ),289 IndexXMLSitemap(290 url='{}/sitemap_news_index_1.xml'.format(self.TEST_BASE_URL),291 sub_sitemaps=[292 PagesXMLSitemap(293 url='{}/sitemap_news_1.xml'.format(self.TEST_BASE_URL),294 pages=[295 SitemapPage(296 url='{}/news/foo.html'.format(self.TEST_BASE_URL),297 news_story=SitemapNewsStory(298 title='Foo <foo>',299 publish_date=self.TEST_DATE_DATETIME,300 publication_name=self.TEST_PUBLICATION_NAME,301 publication_language=self.TEST_PUBLICATION_LANGUAGE,302 ),303 ),304 SitemapPage(305 url='{}/news/bar.html'.format(self.TEST_BASE_URL),306 news_story=SitemapNewsStory(307 title='Bar & bar',308 
publish_date=self.TEST_DATE_DATETIME,309 publication_name=self.TEST_PUBLICATION_NAME,310 publication_language=self.TEST_PUBLICATION_LANGUAGE,311 ),312 ),313 ]314 ),315 IndexXMLSitemap(316 url='{}/sitemap_news_index_2.xml'.format(self.TEST_BASE_URL),317 sub_sitemaps=[318 PagesXMLSitemap(319 url='{}/sitemap_news_2.xml'.format(self.TEST_BASE_URL),320 pages=[321 SitemapPage(322 url='{}/news/bar.html'.format(self.TEST_BASE_URL),323 news_story=SitemapNewsStory(324 title='Bar & bar',325 publish_date=self.TEST_DATE_DATETIME,326 publication_name=self.TEST_PUBLICATION_NAME,327 publication_language=self.TEST_PUBLICATION_LANGUAGE,328 ),329 ),330 SitemapPage(331 url='{}/news/baz.html'.format(self.TEST_BASE_URL),332 news_story=SitemapNewsStory(333 title='BÄ
ž',334 publish_date=self.TEST_DATE_DATETIME,335 publication_name=self.TEST_PUBLICATION_NAME,336 publication_language=self.TEST_PUBLICATION_LANGUAGE,337 ),338 ),339 ],340 ),341 InvalidSitemap(342 url='{}/sitemap_news_missing.xml'.format(self.TEST_BASE_URL),343 reason=(344 'Unable to fetch sitemap from {base_url}/sitemap_news_missing.xml: '345 '404 Not Found'346 ).format(base_url=self.TEST_BASE_URL),347 ),348 ],349 ),350 ],351 ),352 ],353 )354 ]355 )356 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)357 expected_lines = str(expected_sitemap_tree).split()358 actual_lines = str(actual_sitemap_tree).split()359 diff = difflib.ndiff(expected_lines, actual_lines)360 diff_str = '\n'.join(diff)361 assert expected_sitemap_tree == actual_sitemap_tree, diff_str362 assert len(list(actual_sitemap_tree.all_pages())) == 6363 def test_sitemap_tree_for_homepage_gzip(self):364 """Test sitemap_tree_for_homepage() with gzipped sitemaps."""365 with requests_mock.Mocker() as m:366 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)367 m.get(368 self.TEST_BASE_URL + '/',369 text='This is a homepage.',370 )371 m.get(372 self.TEST_BASE_URL + '/robots.txt',373 headers={'Content-Type': 'text/plain'},374 text=textwrap.dedent("""375 User-agent: *376 Disallow: /whatever377 378 Sitemap: {base_url}/sitemap_1.gz379 Sitemap: {base_url}/sitemap_2.dat380 Sitemap: {base_url}/sitemap_3.xml.gz381 """.format(base_url=self.TEST_BASE_URL)).strip(),382 )383 # Gzipped sitemap without correct HTTP header but with .gz extension384 m.get(385 self.TEST_BASE_URL + '/sitemap_1.gz',386 content=gzip(textwrap.dedent("""387 <?xml version="1.0" encoding="UTF-8"?>388 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"389 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">390 <url>391 <loc>{base_url}/news/foo.html</loc>392 <news:news>393 <news:publication>394 <news:name>{publication_name}</news:name>395 <news:language>{publication_language}</news:language>396 
</news:publication>397 <news:publication_date>{publication_date}</news:publication_date>398 <news:title>Foo <foo></news:title> <!-- HTML entity decoding -->399 </news:news>400 </url>401 </urlset>402 """.format(403 base_url=self.TEST_BASE_URL,404 publication_name=self.TEST_PUBLICATION_NAME,405 publication_language=self.TEST_PUBLICATION_LANGUAGE,406 publication_date=self.TEST_DATE_STR_ISO8601,407 )).strip()),408 )409 # Gzipped sitemap with correct HTTP header but without .gz extension410 m.get(411 self.TEST_BASE_URL + '/sitemap_2.dat',412 headers={'Content-Type': 'application/x-gzip'},413 content=gzip(textwrap.dedent("""414 <?xml version="1.0" encoding="UTF-8"?>415 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"416 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">417 <url>418 <loc>{base_url}/news/bar.html</loc>419 <news:news>420 <news:publication>421 <news:name>{publication_name}</news:name>422 <news:language>{publication_language}</news:language>423 </news:publication>424 <news:publication_date>{publication_date}</news:publication_date>425 <news:title><![CDATA[BÄ
r]]></news:title> <!-- CDATA and UTF-8 -->426 </news:news>427 </url>428 </urlset>429 """.format(430 base_url=self.TEST_BASE_URL,431 publication_name=self.TEST_PUBLICATION_NAME,432 publication_language=self.TEST_PUBLICATION_LANGUAGE,433 publication_date=self.TEST_DATE_STR_ISO8601,434 )).strip()),435 )436 # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't437 m.get(438 self.TEST_BASE_URL + '/sitemap_3.xml.gz',439 headers={'Content-Type': 'application/x-gzip'},440 text=textwrap.dedent("""441 <?xml version="1.0" encoding="UTF-8"?>442 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"443 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">444 <url>445 <loc>{base_url}/news/baz.html</loc>446 <news:news>447 <news:publication>448 <news:name>{publication_name}</news:name>449 <news:language>{publication_language}</news:language>450 </news:publication>451 <news:publication_date>{publication_date}</news:publication_date>452 <news:title><![CDATA[BÄ
ž]]></news:title> <!-- CDATA and UTF-8 -->453 </news:news>454 </url>455 </urlset>456 """.format(457 base_url=self.TEST_BASE_URL,458 publication_name=self.TEST_PUBLICATION_NAME,459 publication_language=self.TEST_PUBLICATION_LANGUAGE,460 publication_date=self.TEST_DATE_STR_ISO8601,461 )).strip(),462 )463 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)464 # Don't do an in-depth check, we just need to make sure that gunzip works465 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)466 assert len(actual_sitemap_tree.sub_sitemaps) == 1467 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)468 # noinspection PyUnresolvedReferences469 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3470 # noinspection PyUnresolvedReferences471 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]472 assert isinstance(sitemap_1, PagesXMLSitemap)473 assert len(sitemap_1.pages) == 1474 # noinspection PyUnresolvedReferences475 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]476 assert isinstance(sitemap_2, PagesXMLSitemap)477 assert len(sitemap_2.pages) == 1478 # noinspection PyUnresolvedReferences479 sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2]480 assert isinstance(sitemap_3, PagesXMLSitemap)481 assert len(sitemap_3.pages) == 1482 def test_sitemap_tree_for_homepage_plain_text(self):483 """Test sitemap_tree_for_homepage() with plain text sitemaps."""484 with requests_mock.Mocker() as m:485 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)486 m.get(487 self.TEST_BASE_URL + '/',488 text='This is a homepage.',489 )490 m.get(491 self.TEST_BASE_URL + '/robots.txt',492 headers={'Content-Type': 'text/plain'},493 text=textwrap.dedent("""494 User-agent: *495 Disallow: /whatever496 497 Sitemap: {base_url}/sitemap_1.txt498 Sitemap: {base_url}/sitemap_2.txt.dat499 """.format(base_url=self.TEST_BASE_URL)).strip(),500 )501 # Plain text uncompressed sitemap (no 
Content-Type header)502 m.get(503 self.TEST_BASE_URL + '/sitemap_1.txt',504 text=textwrap.dedent("""505 506 {base_url}/news/foo.html507 508 509 {base_url}/news/bar.html510 511 Some other stuff which totally doesn't look like an URL512 """.format(base_url=self.TEST_BASE_URL)).strip(),513 )514 # Plain text compressed sitemap without .gz extension515 m.get(516 self.TEST_BASE_URL + '/sitemap_2.txt.dat',517 headers={'Content-Type': 'application/x-gzip'},518 content=gzip(textwrap.dedent("""519 {base_url}/news/bar.html520 {base_url}/news/baz.html521 """.format(base_url=self.TEST_BASE_URL)).strip()),522 )523 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)524 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)525 assert len(actual_sitemap_tree.sub_sitemaps) == 1526 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)527 # noinspection PyUnresolvedReferences528 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2529 # noinspection PyUnresolvedReferences530 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]531 assert isinstance(sitemap_1, PagesTextSitemap)532 assert len(sitemap_1.pages) == 2533 # noinspection PyUnresolvedReferences534 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]535 assert isinstance(sitemap_2, PagesTextSitemap)536 assert len(sitemap_2.pages) == 2537 pages = list(actual_sitemap_tree.all_pages())538 assert len(pages) == 4539 assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages540 assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages541 assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages542 # noinspection DuplicatedCode543 def test_sitemap_tree_for_homepage_rss_atom(self):544 """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""545 with requests_mock.Mocker() as m:546 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)547 m.get(548 
self.TEST_BASE_URL + '/',549 text='This is a homepage.',550 )551 m.get(552 self.TEST_BASE_URL + '/robots.txt',553 headers={'Content-Type': 'text/plain'},554 text=textwrap.dedent("""555 User-agent: *556 Disallow: /whatever557 Sitemap: {base_url}/sitemap_rss.xml558 Sitemap: {base_url}/sitemap_atom_0_3.xml559 Sitemap: {base_url}/sitemap_atom_1_0.xml560 """.format(base_url=self.TEST_BASE_URL)).strip(),561 )562 # RSS 2.0 sitemap563 m.get(564 self.TEST_BASE_URL + '/sitemap_rss.xml',565 headers={'Content-Type': 'application/rss+xml'},566 text=textwrap.dedent("""567 <?xml version="1.0" encoding="UTF-8"?>568 <rss version="2.0">569 <channel>570 <title>Test RSS 2.0 feed</title>571 <description>This is a test RSS 2.0 feed.</description>572 <link>{base_url}</link>573 <pubDate>{pub_date}</pubDate>574 <item>575 <title>Test RSS 2.0 story #1</title>576 <description>This is a test RSS 2.0 story #1.</description>577 <link>{base_url}/rss_story_1.html</link>578 <guid isPermaLink="true">{base_url}/rss_story_1.html</guid>579 <pubDate>{pub_date}</pubDate>580 </item>581 <item>582 <title>Test RSS 2.0 story #2</title>583 <description>This is a test RSS 2.0 story #2.</description>584 <link>{base_url}/rss_story_2.html</link>585 <guid isPermaLink="true">{base_url}/rss_story_2.html</guid>586 <pubDate>{pub_date}</pubDate>587 </item>588 </channel>589 </rss>590 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),591 )592 # Atom 0.3 sitemap593 m.get(594 self.TEST_BASE_URL + '/sitemap_atom_0_3.xml',595 headers={'Content-Type': 'application/atom+xml'},596 text=textwrap.dedent("""597 <?xml version="1.0" encoding="UTF-8"?>598 <feed version="0.3" xmlns="http://purl.org/atom/ns#">599 <title>Test Atom 0.3 feed</title>600 <link rel="alternate" type="text/html" href="{base_url}" />601 <modified>{pub_date}</modified>602 <entry>603 <title>Test Atom 0.3 story #1</title>604 <link rel="alternate" type="text/html" href="{base_url}/atom_0_3_story_1.html" />605 
<id>{base_url}/atom_0_3_story_1.html</id>606 <issued>{pub_date}</issued>607 </entry>608 <entry>609 <title>Test Atom 0.3 story #2</title>610 <link rel="alternate" type="text/html" href="{base_url}/atom_0_3_story_2.html" />611 <id>{base_url}/atom_0_3_story_2.html</id>612 <issued>{pub_date}</issued>613 </entry>614 </feed>615 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),616 )617 # Atom 1.0 sitemap618 m.get(619 self.TEST_BASE_URL + '/sitemap_atom_1_0.xml',620 headers={'Content-Type': 'application/atom+xml'},621 text=textwrap.dedent("""622 <?xml version="1.0" encoding="UTF-8"?>623 <feed xmlns="http://www.w3.org/2005/Atom">624 <title>Test Atom 1.0 feed</title>625 <subtitle>This is a test Atom 1.0 feed.</subtitle>626 <link href="{base_url}/sitemap_atom_1_0.xml" rel="self" />627 <link href="{base_url}" />628 <id>{base_url}</id>629 <updated>{pub_date}</updated>630 <entry>631 <title>Test Atom 1.0 story #1</title>632 <link href="{base_url}/atom_1_0_story_1.html" />633 <link rel="alternate" type="text/html" href="{base_url}/atom_1_0_story_1.html?alt" />634 <link rel="edit" href="{base_url}/atom_1_0_story_1.html?edit" />635 <id>{base_url}/atom_1_0_story_1.html</id>636 <updated>{pub_date}</updated>637 <summary>This is test atom 1.0 story #1.</summary>638 <content type="xhtml">639 <div xmlns="http://www.w3.org/1999/xhtml">640 <p>This is test atom 1.0 story #1.</p>641 </div>642 </content>643 <author>644 <name>John Doe</name>645 <email>johndoe@example.com</email>646 </author>647 </entry>648 <entry>649 <title>Test Atom 1.0 story #2</title>650 <link href="{base_url}/atom_1_0_story_2.html" />651 <link rel="alternate" type="text/html" href="{base_url}/atom_1_0_story_2.html?alt" />652 <link rel="edit" href="{base_url}/atom_1_0_story_2.html?edit" />653 <id>{base_url}/atom_1_0_story_2.html</id>654 <updated>{pub_date}</updated>655 <summary>This is test atom 1.0 story #2.</summary>656 <content type="xhtml">657 <div 
xmlns="http://www.w3.org/1999/xhtml">658 <p>This is test atom 1.0 story #2.</p>659 </div>660 </content>661 <author>662 <name>John Doe</name>663 <email>johndoe@example.com</email>664 </author>665 </entry>666 </feed>667 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),668 )669 expected_sitemap_tree = IndexWebsiteSitemap(670 url='{}/'.format(self.TEST_BASE_URL),671 sub_sitemaps=[672 IndexRobotsTxtSitemap(673 url='{}/robots.txt'.format(self.TEST_BASE_URL),674 sub_sitemaps=[675 PagesRSSSitemap(676 url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL),677 pages=[678 SitemapPage(679 url='{}/rss_story_1.html'.format(self.TEST_BASE_URL),680 news_story=SitemapNewsStory(681 title='Test RSS 2.0 story #1',682 publish_date=self.TEST_DATE_DATETIME,683 ),684 ),685 SitemapPage(686 url='{}/rss_story_2.html'.format(self.TEST_BASE_URL),687 news_story=SitemapNewsStory(688 title='Test RSS 2.0 story #2',689 publish_date=self.TEST_DATE_DATETIME,690 )691 )692 ]693 ),694 PagesAtomSitemap(695 url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),696 pages=[697 SitemapPage(698 url='{}/atom_0_3_story_1.html'.format(self.TEST_BASE_URL),699 news_story=SitemapNewsStory(700 title='Test Atom 0.3 story #1',701 publish_date=self.TEST_DATE_DATETIME,702 ),703 ),704 SitemapPage(705 url='{}/atom_0_3_story_2.html'.format(self.TEST_BASE_URL),706 news_story=SitemapNewsStory(707 title='Test Atom 0.3 story #2',708 publish_date=self.TEST_DATE_DATETIME,709 )710 )711 ]712 ),713 PagesAtomSitemap(714 url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),715 pages=[716 SitemapPage(717 url='{}/atom_1_0_story_1.html'.format(self.TEST_BASE_URL),718 news_story=SitemapNewsStory(719 title='Test Atom 1.0 story #1',720 publish_date=self.TEST_DATE_DATETIME,721 ),722 ),723 SitemapPage(724 url='{}/atom_1_0_story_2.html'.format(self.TEST_BASE_URL),725 news_story=SitemapNewsStory(726 title='Test Atom 1.0 story #2',727 publish_date=self.TEST_DATE_DATETIME,728 )729 )730 ]731 ),732 ]733 
)734 ]735 )736 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)737 expected_lines = str(expected_sitemap_tree).split()738 actual_lines = str(actual_sitemap_tree).split()739 diff = difflib.ndiff(expected_lines, actual_lines)740 diff_str = '\n'.join(diff)741 assert expected_sitemap_tree == actual_sitemap_tree, diff_str742 assert len(list(actual_sitemap_tree.all_pages())) == 6743 def test_sitemap_tree_for_homepage_rss_atom_empty(self):744 """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""745 with requests_mock.Mocker() as m:746 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)747 m.get(748 self.TEST_BASE_URL + '/',749 text='This is a homepage.',750 )751 m.get(752 self.TEST_BASE_URL + '/robots.txt',753 headers={'Content-Type': 'text/plain'},754 text=textwrap.dedent("""755 User-agent: *756 Disallow: /whatever757 Sitemap: {base_url}/sitemap_rss.xml758 Sitemap: {base_url}/sitemap_atom_0_3.xml759 Sitemap: {base_url}/sitemap_atom_1_0.xml760 """.format(base_url=self.TEST_BASE_URL)).strip(),761 )762 # RSS 2.0 sitemap763 m.get(764 self.TEST_BASE_URL + '/sitemap_rss.xml',765 headers={'Content-Type': 'application/rss+xml'},766 text=textwrap.dedent("""767 <?xml version="1.0" encoding="UTF-8"?>768 <rss version="2.0">769 <channel>770 <title>Test RSS 2.0 feed</title>771 <description>This is a test RSS 2.0 feed.</description>772 <link>{base_url}</link>773 <pubDate>{pub_date}</pubDate>774 </channel>775 </rss>776 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),777 )778 # Atom 0.3 sitemap779 m.get(780 self.TEST_BASE_URL + '/sitemap_atom_0_3.xml',781 headers={'Content-Type': 'application/atom+xml'},782 text=textwrap.dedent("""783 <?xml version="1.0" encoding="UTF-8"?>784 <feed version="0.3" xmlns="http://purl.org/atom/ns#">785 <title>Test Atom 0.3 feed</title>786 <link rel="alternate" type="text/html" href="{base_url}" />787 <modified>{pub_date}</modified>788 
</feed>789 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),790 )791 # Atom 1.0 sitemap792 m.get(793 self.TEST_BASE_URL + '/sitemap_atom_1_0.xml',794 headers={'Content-Type': 'application/atom+xml'},795 text=textwrap.dedent("""796 <?xml version="1.0" encoding="UTF-8"?>797 <feed xmlns="http://www.w3.org/2005/Atom">798 <title>Test Atom 1.0 feed</title>799 <subtitle>This is a test Atom 1.0 feed.</subtitle>800 <link href="{base_url}/sitemap_atom_1_0.xml" rel="self" />801 <link href="{base_url}" />802 <id>{base_url}</id>803 <updated>{pub_date}</updated>804 </feed>805 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),806 )807 expected_sitemap_tree = IndexWebsiteSitemap(808 url='{}/'.format(self.TEST_BASE_URL),809 sub_sitemaps=[810 IndexRobotsTxtSitemap(811 url='{}/robots.txt'.format(self.TEST_BASE_URL),812 sub_sitemaps=[813 PagesRSSSitemap(814 url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL),815 pages=[]816 ),817 PagesAtomSitemap(818 url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),819 pages=[]820 ),821 PagesAtomSitemap(822 url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),823 pages=[]824 ),825 ]826 )827 ]828 )829 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)830 assert expected_sitemap_tree == actual_sitemap_tree831 assert len(list(actual_sitemap_tree.all_pages())) == 0832 def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):833 """Test sitemap_tree_for_homepage() with clipped XML.834 Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the835 server is unable to generate and compress a 50 MB sitemap XML. 
Google News doesn't seem to have a problem with836 this behavior, so we have to support this too.837 """838 with requests_mock.Mocker() as m:839 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)840 m.get(841 self.TEST_BASE_URL + '/',842 text='This is a homepage.',843 )844 m.get(845 self.TEST_BASE_URL + '/robots.txt',846 headers={'Content-Type': 'text/plain'},847 text=textwrap.dedent("""848 User-agent: *849 Disallow: /whatever850 851 Sitemap: {base_url}/sitemap.xml852 """.format(base_url=self.TEST_BASE_URL)).strip(),853 )854 m.get(855 self.TEST_BASE_URL + '/sitemap.xml',856 text=textwrap.dedent("""857 <?xml version="1.0" encoding="UTF-8"?>858 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"859 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">860 <url>861 <loc>{base_url}/news/first.html</loc>862 <news:news>863 <news:publication>864 <news:name>{publication_name}</news:name>865 <news:language>{publication_language}</news:language>866 </news:publication>867 <news:publication_date>{publication_date}</news:publication_date>868 <news:title>First story</news:title>869 </news:news>870 </url>871 <url>872 <loc>{base_url}/news/second.html</loc>873 <news:news>874 <news:publication>875 <news:name>{publication_name}</news:name>876 <news:language>{publication_language}</news:language>877 </news:publication>878 <news:publication_date>{publication_date}</news:publication_date>879 <news:title>Second story</news:title>880 </news:news>881 </url>882 883 <!-- The following story shouldn't get added as the XML ends prematurely -->884 <url>885 <loc>{base_url}/news/third.html</loc>886 <news:news>887 <news:publication>888 <news:name>{publication_name}</news:name>889 <news:language>{publication_language}</news:language>890 </news:publication>891 <news:publicat892 """.format(893 base_url=self.TEST_BASE_URL,894 publication_name=self.TEST_PUBLICATION_NAME,895 publication_language=self.TEST_PUBLICATION_LANGUAGE,896 
publication_date=self.TEST_DATE_STR_ISO8601,897 )).strip(),898 )899 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)900 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)901 assert len(actual_sitemap_tree.sub_sitemaps) == 1902 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)903 # noinspection PyUnresolvedReferences904 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1905 # noinspection PyUnresolvedReferences906 sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]907 assert isinstance(sitemap, PagesXMLSitemap)908 assert len(sitemap.pages) == 2909 def test_sitemap_tree_for_homepage_no_sitemap(self):910 """Test sitemap_tree_for_homepage() with no sitemaps listed in robots.txt."""911 with requests_mock.Mocker() as m:912 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)913 m.get(914 self.TEST_BASE_URL + '/',915 text='This is a homepage.',916 )917 m.get(918 self.TEST_BASE_URL + '/robots.txt',919 headers={'Content-Type': 'text/plain'},920 text=textwrap.dedent("""921 User-agent: *922 Disallow: /whatever923 """.format(base_url=self.TEST_BASE_URL)).strip(),924 )925 expected_sitemap_tree = IndexWebsiteSitemap(926 url='{}/'.format(self.TEST_BASE_URL),927 sub_sitemaps=[928 IndexRobotsTxtSitemap(929 url='{}/robots.txt'.format(self.TEST_BASE_URL),930 sub_sitemaps=[],931 )932 ]933 )934 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)935 assert expected_sitemap_tree == actual_sitemap_tree936 def test_sitemap_tree_for_homepage_unpublished_sitemap(self):937 """Test sitemap_tree_for_homepage() with some sitemaps not published in robots.txt."""938 with requests_mock.Mocker() as m:939 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)940 m.get(941 self.TEST_BASE_URL + '/',942 text='This is a homepage.',943 )944 m.get(945 self.TEST_BASE_URL + '/robots.txt',946 headers={'Content-Type': 'text/plain'},947 text=textwrap.dedent("""948 
User-agent: *949 Disallow: /whatever950 951 Sitemap: {base_url}/sitemap_public.xml952 """.format(base_url=self.TEST_BASE_URL)).strip(),953 )954 # Public sitemap (linked to from robots.txt)955 m.get(956 self.TEST_BASE_URL + '/sitemap_public.xml',957 text=textwrap.dedent("""958 <?xml version="1.0" encoding="UTF-8"?>959 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">960 <url>961 <loc>{base_url}/news/public.html</loc>962 </url>963 </urlset>964 """.format(965 base_url=self.TEST_BASE_URL,966 publication_name=self.TEST_PUBLICATION_NAME,967 publication_language=self.TEST_PUBLICATION_LANGUAGE,968 publication_date=self.TEST_DATE_STR_ISO8601,969 )).strip(),970 )971 # Private sitemap (to be discovered by trying out a few paths)972 m.get(973 self.TEST_BASE_URL + '/sitemap_index.xml',974 text=textwrap.dedent("""975 <?xml version="1.0" encoding="UTF-8"?>976 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">977 <url>978 <loc>{base_url}/news/private.html</loc>979 </url>980 </urlset>981 """.format(982 base_url=self.TEST_BASE_URL,983 publication_name=self.TEST_PUBLICATION_NAME,984 publication_language=self.TEST_PUBLICATION_LANGUAGE,985 publication_date=self.TEST_DATE_STR_ISO8601,986 )).strip(),987 )988 expected_sitemap_tree = IndexWebsiteSitemap(989 url='{}/'.format(self.TEST_BASE_URL),990 sub_sitemaps=[991 IndexRobotsTxtSitemap(992 url='{}/robots.txt'.format(self.TEST_BASE_URL),993 sub_sitemaps=[994 PagesXMLSitemap(995 url='{}/sitemap_public.xml'.format(self.TEST_BASE_URL),996 pages=[997 SitemapPage(998 url='{}/news/public.html'.format(self.TEST_BASE_URL),999 ),1000 ],1001 ),1002 ],1003 ),1004 PagesXMLSitemap(1005 url='{}/sitemap_index.xml'.format(self.TEST_BASE_URL),1006 pages=[1007 SitemapPage(1008 url='{}/news/private.html'.format(self.TEST_BASE_URL),1009 ),1010 ],1011 ),1012 ]1013 )1014 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1015 assert expected_sitemap_tree == actual_sitemap_tree1016 def 
test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):1017 """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""1018 with requests_mock.Mocker() as m:1019 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1020 m.get(1021 self.TEST_BASE_URL + '/',1022 text='This is a homepage.',1023 )1024 m.get(1025 self.TEST_BASE_URL + '/robots.txt',1026 headers={'Content-Type': ''},1027 text=textwrap.dedent("""1028 User-agent: *1029 Disallow: /whatever1030 """.format(base_url=self.TEST_BASE_URL)).strip(),1031 )1032 expected_sitemap_tree = IndexWebsiteSitemap(1033 url='{}/'.format(self.TEST_BASE_URL),1034 sub_sitemaps=[1035 IndexRobotsTxtSitemap(1036 url='{}/robots.txt'.format(self.TEST_BASE_URL),1037 sub_sitemaps=[],1038 )1039 ]1040 )1041 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1042 assert expected_sitemap_tree == actual_sitemap_tree1043 def test_sitemap_tree_for_homepage_no_robots_txt(self):1044 """Test sitemap_tree_for_homepage() with no robots.txt."""1045 with requests_mock.Mocker() as m:1046 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1047 m.get(1048 self.TEST_BASE_URL + '/',1049 text='This is a homepage.',1050 )1051 # Nonexistent robots.txt1052 m.get(1053 self.TEST_BASE_URL + '/robots.txt',1054 status_code=404,1055 reason='Not Found',1056 headers={'Content-Type': 'text/html'},1057 text="<h1>404 Not Found!</h1>",1058 )1059 expected_sitemap_tree = IndexWebsiteSitemap(1060 url='{}/'.format(self.TEST_BASE_URL),1061 sub_sitemaps=[1062 InvalidSitemap(1063 url='{}/robots.txt'.format(self.TEST_BASE_URL),1064 reason=(1065 'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found'1066 ).format(base_url=self.TEST_BASE_URL),1067 )1068 ]1069 )1070 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1071 assert expected_sitemap_tree == actual_sitemap_tree1072 def test_sitemap_tree_for_homepage_huge_sitemap(self):1073 """Test 
sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""1074 page_count = 10001075 sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>1076 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1077 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"1078 xmlns:xhtml="http://www.w3.org/1999/xhtml">1079 """1080 for x in range(page_count):1081 sitemap_xml += """1082 <url>1083 <loc>{base_url}/news/page_{x}.html</loc>1084 <!-- Element present but empty -->1085 <lastmod />1086 <!-- Some other XML namespace -->1087 <xhtml:link rel="alternate"1088 media="only screen and (max-width: 640px)"1089 href="{base_url}/news/page_{x}.html?mobile=1" />1090 <news:news>1091 <news:publication>1092 <news:name>{publication_name}</news:name>1093 <news:language>{publication_language}</news:language>1094 </news:publication>1095 <news:publication_date>{publication_date}</news:publication_date>1096 <news:title>Foo <foo></news:title> <!-- HTML entity decoding -->1097 </news:news>1098 </url>1099 """.format(1100 x=x,1101 base_url=self.TEST_BASE_URL,1102 publication_name=self.TEST_PUBLICATION_NAME,1103 publication_language=self.TEST_PUBLICATION_LANGUAGE,1104 publication_date=self.TEST_DATE_STR_ISO8601,1105 )1106 sitemap_xml += "</urlset>"1107 with requests_mock.Mocker() as m:1108 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1109 m.get(1110 self.TEST_BASE_URL + '/',1111 text='This is a homepage.',1112 )1113 m.get(1114 self.TEST_BASE_URL + '/robots.txt',1115 headers={'Content-Type': 'text/plain'},1116 text=textwrap.dedent("""1117 User-agent: *1118 Disallow: /whatever1119 1120 Sitemap: {base_url}/sitemap.xml.gz1121 """.format(base_url=self.TEST_BASE_URL)).strip(),1122 )1123 m.get(1124 self.TEST_BASE_URL + '/sitemap.xml.gz',1125 headers={'Content-Type': 'application/x-gzip'},1126 content=gzip(sitemap_xml),1127 )1128 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1129 assert len(list(actual_sitemap_tree.all_pages())) 
== page_count1130 def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):1131 """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""1132 with requests_mock.Mocker() as m:1133 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1134 m.get(1135 self.TEST_BASE_URL + '/',1136 text='This is a homepage.',1137 )1138 robots_txt_body = ""1139 robots_txt_body += "User-agent: *\n"1140 # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL1141 robots_txt_body += " Sitemap:{base_url}/sitemap.xml ".format(base_url=self.TEST_BASE_URL)1142 m.get(1143 self.TEST_BASE_URL + '/robots.txt',1144 headers={'Content-Type': 'text/plain'},1145 text=robots_txt_body,1146 )1147 m.get(1148 self.TEST_BASE_URL + '/sitemap.xml',1149 text=textwrap.dedent("""1150 <?xml version="1.0" encoding="UTF-8"?>1151 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1152 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">1153 <url>1154 <loc>{base_url}/news/first.html</loc>1155 <news:news>1156 <news:publication>1157 <news:name>{publication_name}</news:name>1158 <news:language>{publication_language}</news:language>1159 </news:publication>1160 <news:publication_date>{publication_date}</news:publication_date>1161 <news:title>First story</news:title>1162 </news:news>1163 </url>1164 </urlset>1165 """.format(1166 base_url=self.TEST_BASE_URL,1167 publication_name=self.TEST_PUBLICATION_NAME,1168 publication_language=self.TEST_PUBLICATION_LANGUAGE,1169 publication_date=self.TEST_DATE_STR_ISO8601,1170 )).strip(),1171 )1172 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1173 assert len(list(actual_sitemap_tree.all_pages())) == 11174 def test_sitemap_tree_for_homepage_utf8_bom(self):1175 """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap."""1176 robots_txt_body = textwrap.dedent("""1177 User-agent: *1178 Disallow: /whatever1179 Sitemap: {base_url}/sitemap.xml1180 
""".format(base_url=self.TEST_BASE_URL)).strip()1181 sitemap_xml_body = textwrap.dedent("""1182 <?xml version="1.0" encoding="UTF-8"?>1183 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1184 xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">1185 <url>1186 <loc>{base_url}/news/first.html</loc>1187 <news:news>1188 <news:publication>1189 <news:name>{publication_name}</news:name>1190 <news:language>{publication_language}</news:language>1191 </news:publication>1192 <news:publication_date>{publication_date}</news:publication_date>1193 <news:title>First story</news:title>1194 </news:news>1195 </url>1196 </urlset>1197 """.format(1198 base_url=self.TEST_BASE_URL,1199 publication_name=self.TEST_PUBLICATION_NAME,1200 publication_language=self.TEST_PUBLICATION_LANGUAGE,1201 publication_date=self.TEST_DATE_STR_ISO8601,1202 )).strip()1203 robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig')1204 sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig')1205 with requests_mock.Mocker() as m:1206 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1207 m.get(1208 self.TEST_BASE_URL + '/',1209 text='This is a homepage.',1210 )1211 m.get(1212 self.TEST_BASE_URL + '/robots.txt',1213 headers={'Content-Type': 'text/plain'},1214 content=robots_txt_body_encoded,1215 )1216 m.get(1217 self.TEST_BASE_URL + '/sitemap.xml',1218 content=sitemap_xml_body_encoded,1219 )1220 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)...
combinations.py
Source:combinations.py
1'''Test cases arguments combinations.'''2import os3import sys4import inflection5TEST_DIR = os.path.abspath(os.path.dirname(__file__))6if TEST_DIR not in sys.path:7 sys.path.append(TEST_DIR)8from consts import TEMPDIR, TEST_BASE_URL # noqa: E4029from http_request_codegen.hrc_string import replace_multiple # noqa: E40210def argument_combination_to_filename(combination_name, index):11 return '{}.{}.expect.txt'.format(12 str(index).zfill(3),13 inflection.parameterize(14 replace_multiple(15 combination_name, replacements={16 '"': '-double-quote-',17 '\'': '-single-quote-',18 },19 ),20 ),21 )22def combination_arguments_to_kwargs(arguments):23 kwargs = {}24 for key, value in arguments.items():25 if key == 'kwargs':26 kwargs.update(value)27 else:28 kwargs[key] = value29 return kwargs30def get_argument_combinations(31 method='GET', include_filenames=True,32 dirpath=None,33):34 response = [35 {36 'name': 'URL',37 'arguments': {38 'url': TEST_BASE_URL,39 },40 },41 {42 'name': 'URL wrapping (no wrap)',43 'arguments': {44 'url': TEST_BASE_URL,45 'wrap': 99999,46 },47 },48 {49 'name': 'URL wrapping (wrap 15)',50 'arguments': {51 'url': TEST_BASE_URL,52 'wrap': 15,53 },54 },55 {56 'name': 'Parameter',57 'arguments': {58 'url': TEST_BASE_URL,59 'parameters': [60 {61 'name': 'param-1',62 'value': 'value-1',63 },64 ],65 },66 },67 {68 'name': 'Parameters',69 'arguments': {70 'url': TEST_BASE_URL,71 'parameters': [72 {73 'name': 'param-1',74 'value': 'foo',75 },76 {77 'name': 'param-2',78 'value': 1,79 },80 {81 'name': 'param-3',82 'value': .777,83 },84 {85 'name': 'param-4',86 'value': True,87 },88 ],89 },90 },91 {92 'name': 'Parameter wrapping value',93 'arguments': {94 'url': TEST_BASE_URL,95 'parameters': [96 {97 'name': 'param-1',98 'value': 'foo-bar-baz' * 50,99 },100 ],101 },102 },103 {104 'name': 'Parameters, one wrapping value',105 'arguments': {106 'url': TEST_BASE_URL,107 'parameters': [108 {109 'name': 'param-1',110 'value': 'foo-bar-baz' * 50,111 },112 {113 'name': 
'param-2',114 'value': 'value-2',115 },116 ],117 },118 },119 {120 'name': 'Parameter escaping quotes',121 'arguments': {122 'url': TEST_BASE_URL,123 'parameters': [124 {125 'name': 'param-1-with-\'\'-quotes',126 'value': 'value-1-with-\'\'-quotes',127 },128 ],129 },130 },131 {132 'name': 'URL + header',133 'arguments': {134 'url': TEST_BASE_URL,135 'headers': {136 'Content-Type': 'application/json',137 },138 },139 },140 {141 'name': 'URL + headers',142 'arguments': {143 'url': TEST_BASE_URL,144 'headers': {145 'Content-Type': 'application/json',146 'Accept-Language': 'es',147 },148 },149 },150 {151 'name': 'URL + header wrapping value',152 'arguments': {153 'url': TEST_BASE_URL,154 'headers': {155 'Content-Type': 'application/json' * 5,156 },157 },158 },159 {160 'name': 'URL + headers, one wrapping value',161 'arguments': {162 'url': TEST_BASE_URL,163 'headers': {164 'Content-Type': 'application/json' * 5,165 'Accept-Language': '*',166 },167 },168 },169 {170 'name': 'URL + header escaping quotes',171 'arguments': {172 'url': TEST_BASE_URL,173 'headers': {174 'Accept-Language': 'Header value with \'\' quotes',175 },176 },177 },178 {179 'name': 'URL + kwarg',180 'arguments': {181 'url': TEST_BASE_URL,182 'kwargs': {183 'timeout': 5,184 },185 },186 },187 {188 'name': 'URL + kwargs',189 'arguments': {190 'url': TEST_BASE_URL,191 'kwargs': {192 'timeout': 5,193 'stream': True,194 },195 },196 },197 {198 'name': 'URL + kwarg escaping quotes',199 'arguments': {200 'url': TEST_BASE_URL,201 'kwargs': {202 'cookies': {203 'foo': 'value with \'\' quotes',204 },205 },206 },207 },208 {209 'name': 'URL + kwarg wrapping value',210 'arguments': {211 'url': TEST_BASE_URL,212 'kwargs': {213 'cookies': {214 'bar': 'foo bar baz ' * 50,215 },216 },217 },218 },219 {220 'name': 'URL + kwargs, one wrapping value',221 'arguments': {222 'url': TEST_BASE_URL,223 'kwargs': {224 'cookies': {225 'bar': 'foo bar baz ' * 50,226 },227 'stream': True,228 },229 },230 },231 {232 'name': 'Parameter + 
header',233 'arguments': {234 'url': TEST_BASE_URL,235 'parameters': [236 {237 'name': 'param-1',238 'value': 'value-1',239 },240 ],241 'headers': {242 'Content-Type': 'application/json',243 },244 },245 },246 {247 'name': 'Parameter + header (oneline)',248 'arguments': {249 'url': TEST_BASE_URL,250 'parameters': [251 {252 'name': 'param-1',253 'value': 'value-1',254 },255 ],256 'headers': {257 'Content-Type': 'application/json',258 },259 'oneline': True,260 },261 },262 {263 'name': 'Parameters + header',264 'arguments': {265 'url': TEST_BASE_URL,266 'parameters': [267 {268 'name': 'param-1',269 'value': 'value-1',270 },271 {272 'name': 'param-2',273 'value': 'value-2',274 },275 ],276 'headers': {277 'Content-Type': 'application/json',278 },279 },280 },281 {282 'name': 'Parameter + headers',283 'arguments': {284 'url': TEST_BASE_URL,285 'parameters': [286 {287 'name': 'param-1',288 'value': 'value-1',289 },290 ],291 'headers': {292 'Content-Type': 'application/json',293 'Accept-Language': '*',294 },295 },296 },297 {298 'name': 'Parameters + headers',299 'arguments': {300 'url': TEST_BASE_URL,301 'parameters': [302 {303 'name': 'param-1',304 'value': 'value-1',305 },306 {307 'name': 'param-2',308 'value': 'value-2',309 },310 ],311 'headers': {312 'Content-Type': 'application/json',313 'Accept-Language': '*',314 },315 },316 },317 {318 'name': 'Parameter + kwarg',319 'arguments': {320 'url': TEST_BASE_URL,321 'parameters': [322 {323 'name': 'param-1',324 'value': 'value-1',325 },326 ],327 'kwargs': {328 'timeout': 10,329 },330 },331 },332 {333 'name': 'Parameter + kwarg (oneline)',334 'arguments': {335 'url': TEST_BASE_URL,336 'parameters': [337 {338 'name': 'a',339 'value': 'b',340 },341 ],342 'kwargs': {343 'timeout': 10,344 },345 'oneline': True,346 },347 },348 {349 'name': 'Parameters + kwarg',350 'arguments': {351 'url': TEST_BASE_URL,352 'parameters': [353 {354 'name': 'param-1',355 'value': 'value-1',356 },357 {358 'name': 'param-2',359 'value': 'value-2',360 
},361 ],362 'kwargs': {363 'timeout': 10,364 },365 },366 },367 {368 'name': 'Parameter + kwargs',369 'arguments': {370 'url': TEST_BASE_URL,371 'parameters': [372 {373 'name': 'param-1',374 'value': 'value-1',375 },376 ],377 'kwargs': {378 'timeout': 10,379 'stream': True,380 },381 },382 },383 {384 'name': 'Parameters + kwargs',385 'arguments': {386 'url': TEST_BASE_URL,387 'parameters': [388 {389 'name': 'param-1',390 'value': 'value-1',391 },392 {393 'name': 'param-2',394 'value': 'value-2',395 },396 ],397 'kwargs': {398 'timeout': 10,399 'stream': True,400 },401 },402 },403 {404 'name': 'URL + header + kwarg',405 'arguments': {406 'url': TEST_BASE_URL,407 'headers': {408 'Content-Type': 'application/json',409 },410 'kwargs': {411 'timeout': 5,412 },413 },414 },415 {416 'name': 'URL + header + kwarg (oneline)',417 'arguments': {418 'url': TEST_BASE_URL,419 'headers': {420 'Content-Type': 'application/json',421 },422 'kwargs': {423 'timeout': 5,424 },425 'oneline': True,426 },427 },428 {429 'name': 'URL + headers + kwarg',430 'arguments': {431 'url': TEST_BASE_URL,432 'headers': {433 'Content-Type': 'application/json',434 'Accept-Language': '*',435 },436 'kwargs': {437 'timeout': 5,438 },439 },440 },441 {442 'name': 'URL + header + kwargs',443 'arguments': {444 'url': TEST_BASE_URL,445 'headers': {446 'Accept-Language': '*',447 },448 'kwargs': {449 'timeout': 5,450 'stream': False,451 },452 },453 },454 {455 'name': 'URL + headers + kwargs',456 'arguments': {457 'url': TEST_BASE_URL,458 'headers': {459 'Content-Type': 'application/json',460 'Accept-Language': '*',461 },462 'kwargs': {463 'timeout': 5,464 'stream': False,465 },466 },467 },468 {469 'name': 'Parameter + header + kwarg',470 'arguments': {471 'url': TEST_BASE_URL,472 'parameters': [473 {474 'name': 'param-1',475 'value': 'value-1',476 },477 ],478 'headers': {479 'Content-Type': 'application/json',480 },481 'kwargs': {482 'timeout': 5,483 },484 },485 },486 {487 'name': 'Parameter + header + kwargs',488 
'arguments': {489 'url': TEST_BASE_URL,490 'parameters': [491 {492 'name': 'param-1',493 'value': 'value-1',494 },495 ],496 'headers': {497 'Content-Type': 'application/json',498 },499 'kwargs': {500 'timeout': 5,501 'stream': True,502 },503 },504 },505 {506 'name': 'Parameters + header + kwarg',507 'arguments': {508 'url': TEST_BASE_URL,509 'parameters': [510 {511 'name': 'param-1',512 'value': 'value-1',513 },514 {515 'name': 'param-2',516 'value': 7.77,517 },518 ],519 'headers': {520 'Content-Type': 'application/json',521 },522 'kwargs': {523 'timeout': 5,524 },525 },526 },527 {528 'name': 'Parameters + header + kwargs',529 'arguments': {530 'url': TEST_BASE_URL,531 'parameters': [532 {533 'name': 'param-1',534 'value': 'value-1',535 },536 {537 'name': 'param-2',538 'value': 7.77,539 },540 ],541 'headers': {542 'Content-Type': 'application/json',543 },544 'kwargs': {545 'timeout': 5,546 'stream': False,547 },548 },549 },550 {551 'name': 'Parameters + headers + kwarg',552 'arguments': {553 'url': TEST_BASE_URL,554 'parameters': [555 {556 'name': 'param-1',557 'value': 'value-1',558 },559 {560 'name': 'param-2',561 'value': 7.77,562 },563 ],564 'headers': {565 'Content-Type': 'application/json',566 'Accept-Language': 'fr',567 },568 'kwargs': {569 'timeout': 5,570 },571 },572 },573 {574 'name': 'Parameters + headers + kwargs',575 'arguments': {576 'url': TEST_BASE_URL,577 'parameters': [578 {579 'name': 'param-1',580 'value': 'value-1',581 },582 {583 'name': 'param-2',584 'value': 7.77,585 },586 ],587 'headers': {588 'Content-Type': 'application/json',589 'Accept-Language': 'fr',590 },591 'kwargs': {592 'timeout': 5,593 'stream': True,594 },595 },596 },597 {598 'name': 'Setup',599 'arguments': {600 'url': TEST_BASE_URL,601 'setup': True,602 },603 },604 {605 'name': 'No setup',606 'arguments': {607 'url': TEST_BASE_URL,608 'setup': False,609 },610 },611 {612 'name': 'Custom setup',613 'arguments': {614 'url': TEST_BASE_URL,615 'setup': 'custom_setup=1\n\n',616 },617 
},618 {619 'name': 'Custom teardown',620 'arguments': {621 'url': TEST_BASE_URL,622 'teardown': '\n\ncustom_teardown=1',623 },624 },625 {626 'name': 'Quote character \'',627 'arguments': {628 'url': TEST_BASE_URL,629 'quote_char': '\'',630 },631 },632 {633 'name': 'Quote character "',634 'arguments': {635 'url': TEST_BASE_URL,636 'quote_char': '"',637 },638 },639 {640 'name': 'Indent 2 spaces',641 'arguments': {642 'url': TEST_BASE_URL,643 'indent': ' ',644 'headers': {645 'Accept-Language': 'es en fr * ' * 20,646 },647 },648 },649 {650 'name': 'Indent 4 spaces',651 'arguments': {652 'url': TEST_BASE_URL,653 'indent': ' ',654 'headers': {655 'Accept-Language': 'es en fr * ' * 20,656 },657 },658 },659 {660 'name': 'One line',661 'arguments': {662 'url': TEST_BASE_URL,663 'oneline': True,664 },665 },666 {667 'name': 'One line + no setup',668 'arguments': {669 'url': TEST_BASE_URL,670 'oneline': True,671 'setup': False,672 },673 },674 {675 'name': 'Wrap 0',676 'arguments': {677 'url': TEST_BASE_URL,678 'wrap': 0,679 },680 },681 {682 'name': 'Wrap 1',683 'arguments': {684 'url': TEST_BASE_URL,685 'wrap': 1,686 },687 },688 {689 'name': 'Wrap 10',690 'arguments': {691 'url': TEST_BASE_URL,692 'wrap': 10,693 },694 },695 {696 'name': 'Wrap 20',697 'arguments': {698 'url': TEST_BASE_URL,699 'wrap': 20,700 },701 },702 {703 'name': 'Wrap 25',704 'arguments': {705 'url': TEST_BASE_URL,706 'wrap': 25,707 },708 },709 {710 'name': 'Wrap 30',711 'arguments': {712 'url': TEST_BASE_URL,713 'wrap': 30,714 },715 },716 {717 'name': 'Wrap 35',718 'arguments': {719 'url': TEST_BASE_URL,720 'wrap': 35,721 },722 },723 {724 'name': 'Wrap 40',725 'arguments': {726 'url': TEST_BASE_URL,727 'wrap': 40,728 },729 },730 {731 'name': 'Wrap infinite',732 'arguments': {733 'url': TEST_BASE_URL,734 'wrap': float('inf'),735 },736 },737 {738 'name': 'Wrap null is infinite',739 'arguments': {740 'url': TEST_BASE_URL,741 'wrap': None,742 },743 },744 ]745 if method.lower() == 'post':746 
response.extend([747 {748 'name': 'Data by parameter (text/plain)',749 'arguments': {750 'url': TEST_BASE_URL,751 'parameters': [752 {753 'name': '',754 'value': 'foo bar baz ' * 3,755 },756 ],757 'headers': {758 'Content-Type': 'text/plain',759 },760 },761 },762 {763 'name': 'Data by parameter (text/plain) wrapping value',764 'arguments': {765 'url': TEST_BASE_URL,766 'parameters': [767 {768 'name': '',769 'value': 'foo bar baz ' * 30,770 },771 ],772 'headers': {773 'Content-Type': 'text/plain',774 },775 },776 },777 {778 'name': 'Data by parameter (application/json)',779 'arguments': {780 'url': TEST_BASE_URL,781 'parameters': [782 {783 'name': 'param-1',784 'value': 'value-1',785 },786 ],787 'headers': {788 'Content-Type': 'application/json',789 },790 },791 },792 {793 'name': 'Data by parameters (application/json)',794 'arguments': {795 'url': TEST_BASE_URL,796 'parameters': [797 {798 'name': 'param-int',799 'value': 1,800 },801 {802 'name': 'param-float',803 'value': .777,804 },805 {806 'name': 'param-bool',807 'value': True,808 },809 ],810 'headers': {811 'Content-Type': 'application/json',812 },813 },814 },815 {816 'name': (817 'Data by parameter'818 ' (application/x-www-form-urlencoded)'819 ),820 'arguments': {821 'url': TEST_BASE_URL,822 'parameters': [823 {824 'name': 'param-1',825 'value': 'value-1',826 },827 ],828 'headers': {829 'Content-Type': 'application/x-www-form-urlencoded',830 },831 },832 },833 {834 'name': (835 'Data by parameters'836 ' (application/x-www-form-urlencoded)'837 ),838 'arguments': {839 'url': TEST_BASE_URL,840 'parameters': [841 {842 'name': 'param-int',843 'value': 1,844 },845 {846 'name': 'param-float',847 'value': .777,848 },849 {850 'name': 'param-bool',851 'value': True,852 },853 ],854 'headers': {855 'Content-Type': 'application/x-www-form-urlencoded',856 },857 },858 },859 {860 'name': 'File by filepath (multipart/form-data)',861 'arguments': {862 'url': TEST_BASE_URL,863 'files': {864 'param-1': os.path.join(TEMPDIR, 
'file-1.ext'),865 },866 },867 },868 {869 'name': 'Files by filepath (multipart/form-data)',870 'arguments': {871 'url': TEST_BASE_URL,872 'files': {873 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),874 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),875 },876 },877 },878 {879 'name': 'File by filepath (multipart/form-data) wrapping',880 'arguments': {881 'url': TEST_BASE_URL,882 'files': {883 'param-1': os.path.join(884 TEMPDIR, '%s.ext' % ('foo' * 40),885 ),886 },887 },888 },889 {890 'name': (891 'Files by filepath (multipart/form-data)'892 ' with Content-Type'893 ),894 'arguments': {895 'url': TEST_BASE_URL,896 'files': {897 'param-1': (898 os.path.join(TEMPDIR, 'file-1.ext'),899 'text/plain',900 ),901 'param-2': (902 os.path.join(TEMPDIR, 'file-2.ext'),903 'text/csv',904 ),905 },906 },907 },908 {909 'name': (910 'File by filepath (multipart/form-data)'911 ' with Content-Type wrapping'912 ),913 'arguments': {914 'url': TEST_BASE_URL,915 'files': {916 'param-1': (917 os.path.join(TEMPDIR, 'file-1.ext'),918 'text/plain ' * 20,919 ),920 },921 },922 },923 {924 'name': (925 'File by filepath (multipart/form-data),'926 ' Content-Type, header'927 ),928 'arguments': {929 'url': TEST_BASE_URL,930 'files': {931 'param-1': (932 os.path.join(TEMPDIR, 'file-1.ext'),933 'text/plain',934 {'Accept-Language': 'es'},935 ),936 },937 },938 },939 {940 'name': (941 'File by filepath (multipart/form-data),'942 ' Content-Type, headers'943 ),944 'arguments': {945 'url': TEST_BASE_URL,946 'files': {947 'param-1': (948 os.path.join(TEMPDIR, 'file-1.ext'),949 'text/plain',950 {951 'Accept-Language': 'es',952 'Accept-Charset': 'utf-8',953 },954 ),955 },956 },957 },958 {959 'name': 'Files by filepath (multipart/form-data) + parameter',960 'arguments': {961 'url': TEST_BASE_URL,962 'files': {963 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),964 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),965 },966 'parameters': [967 {968 'name': 'param-1',969 'value': 'value-1',970 },971 ],972 },973 },974 
{975 'name': 'Files by filepath (multipart/form-data) + parameters',976 'arguments': {977 'url': TEST_BASE_URL,978 'files': {979 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),980 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),981 },982 'parameters': [983 {984 'name': 'param-1',985 'value': 'value-1',986 },987 {988 'name': 'param-2',989 'value': 'value-2',990 },991 ],992 },993 },994 {995 'name': (996 'Files by filepath (multipart/form-data) + parameter'997 ' + header'998 ),999 'arguments': {1000 'url': TEST_BASE_URL,1001 'files': {1002 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1003 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1004 },1005 'parameters': [1006 {1007 'name': 'param-1',1008 'value': 'value-1',1009 },1010 ],1011 'headers': {1012 'Accept-Language': 'fr',1013 },1014 },1015 },1016 {1017 'name': (1018 'Files by filepath (multipart/form-data) + parameter'1019 ' + headers'1020 ),1021 'arguments': {1022 'url': TEST_BASE_URL,1023 'files': {1024 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1025 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1026 },1027 'parameters': [1028 {1029 'name': 'param-1',1030 'value': 'value-1',1031 },1032 ],1033 'headers': {1034 'Accept-Language': 'fr',1035 'Accept-Charset': 'utf-8',1036 },1037 },1038 },1039 {1040 'name': (1041 'Files by filepath (multipart/form-data) + parameters'1042 ' + header'1043 ),1044 'arguments': {1045 'url': TEST_BASE_URL,1046 'files': {1047 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1048 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1049 },1050 'parameters': [1051 {1052 'name': 'param-1',1053 'value': 'value-1',1054 },1055 {1056 'name': 'param-2',1057 'value': 'value-2',1058 },1059 ],1060 'headers': {1061 'Accept-Language': 'es',1062 },1063 },1064 },1065 {1066 'name': (1067 'Files by filepath (multipart/form-data) + parameters'1068 ' + headers'1069 ),1070 'arguments': {1071 'url': TEST_BASE_URL,1072 'files': {1073 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1074 'param-2': os.path.join(TEMPDIR, 
'file-2.ext'),1075 },1076 'parameters': [1077 {1078 'name': 'param-1',1079 'value': 'value-1',1080 },1081 {1082 'name': 'param-2',1083 'value': 'value-2',1084 },1085 ],1086 'headers': {1087 'Accept-Language': 'fr',1088 'Accept-Charset': 'utf-8',1089 },1090 },1091 },1092 {1093 'name': (1094 'Files by filepath (multipart/form-data) + parameter'1095 ' + header + kwarg'1096 ),1097 'arguments': {1098 'url': TEST_BASE_URL,1099 'files': {1100 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1101 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1102 },1103 'parameters': [1104 {1105 'name': 'param-1',1106 'value': 'value-1',1107 },1108 ],1109 'headers': {1110 'Accept-Language': 'fr',1111 },1112 'kwargs': {1113 'timeout': 10,1114 },1115 },1116 },1117 {1118 'name': (1119 'Files by filepath (multipart/form-data) + parameter'1120 ' + headers + kwarg'1121 ),1122 'arguments': {1123 'url': TEST_BASE_URL,1124 'files': {1125 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1126 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1127 },1128 'parameters': [1129 {1130 'name': 'param-1',1131 'value': 'value-1',1132 },1133 ],1134 'headers': {1135 'Accept-Language': 'fr',1136 'Accept-Charset': 'utf-8',1137 },1138 'kwargs': {1139 'timeout': 10,1140 },1141 },1142 },1143 {1144 'name': (1145 'Files by filepath (multipart/form-data) + parameters'1146 ' + header + kwarg'1147 ),1148 'arguments': {1149 'url': TEST_BASE_URL,1150 'files': {1151 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1152 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1153 },1154 'parameters': [1155 {1156 'name': 'param-1',1157 'value': 'value-1',1158 },1159 {1160 'name': 'param-2',1161 'value': 'value-2',1162 },1163 ],1164 'headers': {1165 'Accept-Language': 'fr',1166 },1167 'kwargs': {1168 'timeout': 10,1169 },1170 },1171 },1172 {1173 'name': (1174 'Files by filepath (multipart/form-data) + parameters'1175 ' + headers + kwarg'1176 ),1177 'arguments': {1178 'url': TEST_BASE_URL,1179 'files': {1180 'param-1': os.path.join(TEMPDIR, 
'file-1.ext'),1181 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1182 },1183 'parameters': [1184 {1185 'name': 'param-1',1186 'value': 'value-1',1187 },1188 {1189 'name': 'param-2',1190 'value': 'value-2',1191 },1192 ],1193 'headers': {1194 'Accept-Language': 'fr',1195 'Accept-Charset': 'utf-8',1196 },1197 'kwargs': {1198 'timeout': 10,1199 },1200 },1201 },1202 {1203 'name': (1204 'Files by filepath (multipart/form-data) + parameter'1205 ' + header + kwargs'1206 ),1207 'arguments': {1208 'url': TEST_BASE_URL,1209 'files': {1210 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1211 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1212 },1213 'parameters': [1214 {1215 'name': 'param-1',1216 'value': 'value-1',1217 },1218 ],1219 'headers': {1220 'Accept-Language': 'fr',1221 },1222 'kwargs': {1223 'timeout': 10,1224 'cookies': {1225 'hello': 'world',1226 },1227 },1228 },1229 },1230 {1231 'name': (1232 'Files by filepath (multipart/form-data) + parameter'1233 ' + headers + kwargs'1234 ),1235 'arguments': {1236 'url': TEST_BASE_URL,1237 'files': {1238 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1239 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1240 },1241 'parameters': [1242 {1243 'name': 'param-1',1244 'value': 'value-1',1245 },1246 ],1247 'headers': {1248 'Accept-Language': 'fr',1249 'Accept-Charset': 'utf-8',1250 },1251 'kwargs': {1252 'timeout': 10,1253 'stream': False,1254 },1255 },1256 },1257 {1258 'name': (1259 'Files by filepath (multipart/form-data) + parameters'1260 ' + header + kwargs'1261 ),1262 'arguments': {1263 'url': TEST_BASE_URL,1264 'files': {1265 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1266 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1267 },1268 'parameters': [1269 {1270 'name': 'param-1',1271 'value': 'value-1',1272 },1273 {1274 'name': 'param-2',1275 'value': 'value-2',1276 },1277 ],1278 'headers': {1279 'Accept-Language': 'fr',1280 },1281 'kwargs': {1282 'timeout': 10,1283 'stream': False,1284 },1285 },1286 },1287 {1288 'name': (1289 
'Files by filepath (multipart/form-data) + parameters'1290 ' + headers + kwargs'1291 ),1292 'arguments': {1293 'url': TEST_BASE_URL,1294 'files': {1295 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1296 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1297 },1298 'parameters': [1299 {1300 'name': 'param-1',1301 'value': 'value-1',1302 },1303 {1304 'name': 'param-2',1305 'value': 'value-2',1306 },1307 ],1308 'headers': {1309 'Accept-Language': 'fr',1310 'Accept-Charset': 'utf-8',1311 },1312 'kwargs': {1313 'timeout': 10,1314 'stream': False,1315 },1316 },1317 },1318 {1319 'name': (1320 'No setup + files by filepath (multipart/form-data)'1321 ' + parameters + headers + kwargs + '1322 ),1323 'arguments': {1324 'url': TEST_BASE_URL,1325 'setup': False,1326 'files': {1327 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1328 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1329 },1330 'parameters': [1331 {1332 'name': 'param-1',1333 'value': 'value-1',1334 },1335 {1336 'name': 'param-2',1337 'value': 'value-2',1338 },1339 ],1340 'headers': {1341 'Accept-Language': 'fr',1342 'Accept-Charset': 'utf-8',1343 },1344 'kwargs': {1345 'timeout': 10,1346 'stream': False,1347 },1348 },1349 },1350 ])1351 if include_filenames:1352 for index, args_group in enumerate(response):1353 fname = argument_combination_to_filename(1354 args_group['name'], index,1355 )1356 if dirpath and os.path.exists(dirpath):1357 fname = os.path.join(dirpath, fname)1358 args_group['filename'] = fname...
test_logoscraper.py
Source:test_logoscraper.py
"""Tests for the logo-scraping helpers in ``scraper.logoscraper``."""
from bs4 import BeautifulSoup

import scraper.logoscraper as logoscraper

# Base URL passed as the second argument to every helper under test.
test_base_url = "https://www.testbase.com"


def test_get_logo_should_pass():
    """get_logo() returns the <img> src for each sample markup snippet."""
    # Only the last snippet interpolates a value, so it is the only f-string.
    htmls = [
        "<div class='logo'><img src='https://www.test.com'></img></div>",
        "<div id='logo'><img src='https://www.test.com'></img></div>",
        "<a><img src='https://www.test.com'></img></a>",
        "<div><img src='https://www.test.com'></img></div>",
        # NOTE(review): this snippet opens with <a> but closes with </div>;
        # kept verbatim -- confirm whether the malformed markup is intended.
        f"<a href={test_base_url}><img src='https://www.test.com'></img></div>",
    ]
    for html in htmls:
        assert (
            logoscraper.get_logo(BeautifulSoup(html, "html.parser"), test_base_url)
            == "https://www.test.com"
        )


def test_find_image_tag():
    """find_img_tag() gives a truthy result for an <img> nested inside an <a>."""
    # NOTE(review): test_base_url already starts with "https://", so the src
    # below becomes "https://https://www.testbase.com" -- confirm intended.
    result = logoscraper.find_img_tag(
        BeautifulSoup(
            f'<a><img src="https://{test_base_url}"></img></a>', "html.parser"
        ),
        test_base_url,
    )
    assert result


def test_find_image_tag_return_itself():
    """find_img_tag() gives a truthy result when the markup is a bare <img>."""
    result = logoscraper.find_img_tag(
        BeautifulSoup(f'<img src="https://{test_base_url}"></img>', "html.parser"),
        test_base_url,
    )
    assert result


def test_find_image_tag_retrun_none():
    """find_img_tag() returns None when the markup contains no <img>."""
    result = logoscraper.find_img_tag(
        BeautifulSoup(f'<a href="https://{test_base_url}"></a>', "html.parser"),
        test_base_url,
    )
    assert result is None


def test_format_image_source():
    """format_image_source() returns a src equal to the base URL unchanged."""
    html = BeautifulSoup(f"<img src='{test_base_url}'></img>", "html.parser").find(
        "img"
    )
    assert logoscraper.format_image_source(html, test_base_url) == test_base_url


def test_format_image_source_no_source():
    """format_image_source() returns None for an <img> without a src."""
    html = BeautifulSoup("<img></img>", "html.parser").find("img")
    assert logoscraper.format_image_source(html, test_base_url) is None


def test_format_image_source_relative_path():
    """format_image_source() prefixes a relative src with the base URL."""
    html = BeautifulSoup(
        "<img src='resources/images/image.png'></img>", "html.parser"
    ).find("img")
    # The source listing is truncated right after this comparison; only the
    # closing parenthesis required by the syntax has been restored.
    assert (
        logoscraper.format_image_source(html, test_base_url)
        == f"{test_base_url}/resources/images/image.png"
    )
Learn to execute automation testing from scratch with the LambdaTest Learning Hub — from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, such as Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!