@@ -15,14 +15,18 @@ class BaseRobotTest:
     good = []
     bad = []
     site_maps = None
+    expected_output = None
 
     def __init_subclass__(cls):
         super().__init_subclass__()
         # Remove tests that do nothing.
-        if not cls.good:
-            cls.test_good_urls = None
-        if not cls.bad:
-            cls.test_bad_urls = None
+        if issubclass(cls, unittest.TestCase):
+            if not cls.good:
+                cls.test_good_urls = None
+            if not cls.bad:
+                cls.test_bad_urls = None
+            if cls.expected_output is None:
+                cls.test_string_formatting = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
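Note on the `__init_subclass__` hook above: it works because unittest's loader only collects attributes whose names start with "test" and that are callable, so shadowing an inherited test method with None drops it from a concrete subclass; the `issubclass(cls, unittest.TestCase)` guard presumably keeps the intermediate helper mixins from being stripped before their concrete subclasses fill in `good`, `bad` or `expected_output`. A minimal standalone sketch of the mechanism (the Base/WithTest/WithoutTest names are illustrative, not part of this patch):

    import unittest

    class Base:
        def test_something(self):
            self.assertTrue(True)

    class WithTest(Base, unittest.TestCase):
        pass

    class WithoutTest(Base, unittest.TestCase):
        test_something = None   # not callable, so the loader skips it

    loader = unittest.TestLoader()
    print(loader.getTestCaseNames(WithTest))     # ['test_something']
    print(loader.getTestCaseNames(WithoutTest))  # []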
@@ -50,6 +54,8 @@ def test_bad_urls(self):
     def test_site_maps(self):
         self.assertEqual(self.parser.site_maps(), self.site_maps)
 
+    def test_string_formatting(self):
+        self.assertEqual(str(self.parser), self.expected_output)
 
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -61,6 +67,56 @@ class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     good = ['/', '/test.html']
     bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
 
+class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
+    # Example from RFC 9309, section 5.1.
+    robots_txt = """\
+User-Agent: *
+Disallow: *.gif$
+Disallow: /example/
+Allow: /publications/
+
+User-Agent: foobot
+Disallow:/
+Allow:/example/page.html
+Allow:/example/allowed.gif
+
+User-Agent: barbot
+User-Agent: bazbot
+Disallow: /example/page.html
+
+User-Agent: quxbot
+"""
+    good = [
+        '/', '/publications/',
+        ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
+        ('barbot', '/'), ('barbot', '/example/'),
+        ('barbot', '/example/allowed.gif'),
+        ('barbot', '/example/disallowed.gif'),
+        ('barbot', '/publications/'),
+        ('barbot', '/publications/allowed.gif'),
+        ('bazbot', '/'), ('bazbot', '/example/'),
+        ('bazbot', '/example/allowed.gif'),
+        ('bazbot', '/example/disallowed.gif'),
+        ('bazbot', '/publications/'),
+        ('bazbot', '/publications/allowed.gif'),
+        ('quxbot', '/'), ('quxbot', '/example/'),
+        ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
+        ('quxbot', '/example/disallowed.gif'),
+        ('quxbot', '/publications/'),
+        ('quxbot', '/publications/allowed.gif'),
+    ]
+    bad = [
+        '/example/', '/example/page.html', '/example/allowed.gif',
+        '/example/disallowed.gif',
+        '/publications/allowed.gif',
+        ('foobot', '/'), ('foobot', '/example/'),
+        ('foobot', '/example/disallowed.gif'),
+        ('foobot', '/publications/'),
+        ('foobot', '/publications/allowed.gif'),
+        ('barbot', '/example/page.html'),
+        ('bazbot', '/example/page.html'),
+    ]
+
 
 class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -137,6 +193,7 @@ def test_request_rate(self):
 class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = ''
     good = ['/foo']
+    expected_output = ''
 
 
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
@@ -221,17 +278,185 @@ class UserAgentGoogleMobileTest(UserAgentOrderingTest):
     agent = 'Googlebot-Mobile'
 
 
-class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
-    # Google also got the order wrong. You need
-    # to specify the URLs from more specific to more general
+class LongestMatchTest(BaseRobotTest, unittest.TestCase):
+    # Based on example from RFC 9309, section 5.2.
     robots_txt = """\
-User-agent: Googlebot
-Allow: /folder1/myfile.html
-Disallow: /folder1/
+User-agent: *
+Allow: /example/page/
+Disallow: /example/page/disallowed.gif
+Allow: /example/
 """
-    agent = 'googlebot'
-    good = ['/folder1/myfile.html']
-    bad = ['/folder1/anotherfile.html']
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif']
+
+
+class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Allow: /example/page/
+Disallow: *.gif
+Allow: /example/
+"""
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif', '/x.gif']
+
+
+class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam
+Disallow: /spam
+"""
+    good = ['/spam', '/spam/']
+
+
+class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam$
+Disallow: /spam
+Disallow: /eggs$
+Allow: /eggs
+Disallow: /eggs$
+"""
+    good = ['/spam', '/eggs', '/eggs/']
+    bad = ['/spam/']
+
+
+class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: *am
+Disallow: /spam
+Disallow: *gs
+Allow: /eggs
+Disallow: *gs
+"""
+    good = ['/spam', '/eggs', '/spam/', '/eggs/']
+
+
+class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: spambot
+Disallow: /another/path
+"""
+    agent = 'spambot'
+    bad = ['/some/path', '/another/path']
+
+
+class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+User-agent: eggsbot
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('spambot', '/another/path'),
+            ('eggsbot', '/'), ('eggsbot', '/some/path')]
+    bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
+    expected_output = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: eggsbot
+Disallow: /another/path\
+"""
+
+class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+
+User-agent: eggsbot
+Disallow: /some/path
+
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('eggsbot', '/')]
+    bad = [
+        ('spambot', '/some/path'), ('spambot', '/another/path'),
+        ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
+    ]
+    expected_output = """\
+User-agent: spambot
+User-agent: eggsbot
+Disallow: /another/path
+Disallow: /some/path\
+"""
+
+
+class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+Disallow: /some/path
+
+User-agent: *
+Disallow: /another/path
+"""
+    good = ['/', '/some/path']
+    bad = ['/another/path']
+    expected_output = """\
+User-agent: *
+Disallow: /another/path\
+"""
+
+
+class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+"""
+    agent = 'spambot'
+    good = ['/', '/some/path']
+    expected_output = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+Allow:\
+"""
+
+
+class WeirdPathTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = f"""\
+User-agent: *
+Disallow: /a$$$
+Disallow: /b$z
+Disallow: /c***
+Disallow: /d***z
+Disallow: /e*$**$$
+Disallow: /f*$**$$z
+Disallow: /g$*$$**
+Disallow: /h$*$$**z
+"""
+    good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz']
+    bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g']
+    expected_output = """\
+User-agent: *
+Disallow: /c*
+Disallow: /d*z
+Disallow: /e*$
+Disallow: /a$
+Disallow: /g$\
+"""
+
+
+class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
+    # This test would take many years with a naive translation to a
+    # regular expression (* -> .*).
+    N = 50
+    robots_txt = f"""\
+User-agent: *
+Disallow: /{'*a' * N}*b
+"""
+    good = ['/' + 'a' * N + 'a']
+    bad = ['/' + 'a' * N + 'b']
 
 
 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
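Note on PathWithManyWildcardsTest: translating every `*` to `.*` and handing the result to `re` makes the backtracking engine try roughly 2**N ways of splitting the run of `a` characters across the wildcards before it finds the single valid split (or proves there is none), which is where the "many years" in the comment comes from. The sketch below only illustrates that effect against one linear-scan alternative (the classic two-pointer glob matcher); it ignores robots.txt details such as `$` anchoring and prefix matching, and it is not meant to mirror the patched urllib.robotparser implementation:

    import re

    def naive_regex(pattern):
        # Naive translation: every '*' becomes '.*'.  Correct, but for a
        # pattern like '/*a*a...*a*b' the backtracking engine explores an
        # exponential number of ways to split the 'a' run between the
        # wildcards before failing or finding the one successful split.
        return re.compile('.*'.join(map(re.escape, pattern.split('*'))))

    def glob_match(pattern, text):
        # Two-pointer wildcard matching: on a mismatch, resume from the
        # character after the most recent '*', giving an
        # O(len(pattern) * len(text)) worst case instead of exponential.
        # (Real robots.txt rules are prefix matches, i.e. behave as if a
        # trailing '*' were appended; that is ignored here.)
        pi = ti = 0
        star = star_ti = -1
        while ti < len(text):
            if pi < len(pattern) and pattern[pi] != '*' and pattern[pi] == text[ti]:
                pi += 1
                ti += 1
            elif pi < len(pattern) and pattern[pi] == '*':
                star, star_ti = pi, ti
                pi += 1
            elif star != -1:
                star_ti += 1
                pi, ti = star + 1, star_ti
            else:
                return False
        return all(c == '*' for c in pattern[pi:])

    # Both agree on small inputs; the regex version becomes unusable as N
    # grows, while glob_match handles N = 50 instantly.
    for n in range(1, 8):
        pat = '/' + '*a' * n + '*b'
        for url in ('/' + 'a' * n + 'b', '/' + 'a' * n + 'a'):
            assert glob_match(pat, url) == bool(naive_regex(pat).fullmatch(url))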
@@ -251,19 +476,6 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
            '/yet/one/path?name=value&more']
 
 
-class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
-    # obey first * entry (#4108)
-    robots_txt = """\
-User-agent: *
-Disallow: /some/path
-
-User-agent: *
-Disallow: /another/path
-"""
-    good = ['/another/path']
-    bad = ['/some/path']
-
-
 class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
@@ -365,17 +577,60 @@ class StringFormattingTest(BaseRobotTest, unittest.TestCase):
 """
 
     expected_output = """\
-User-agent: cybermapper
-Disallow: /some/path
-
 User-agent: *
 Crawl-delay: 1
 Request-rate: 3/15
-Disallow: /cyberworld/map/\
+Disallow: /cyberworld/map/
+
+User-agent: cybermapper
+Disallow: /some/path\
 """
 
-    def test_string_formatting(self):
-        self.assertEqual(str(self.parser), self.expected_output)
+
+class ConstructedStringFormattingTest(unittest.TestCase):
+    def test_empty(self):
+        parser = urllib.robotparser.RobotFileParser()
+        self.assertEqual(str(parser), '')
+
+    def test_group_without_rules(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['hambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['eggsbot']
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Allow:
+
+User-agent: hambot
+Disallow: /ham
+
+User-agent: eggsbot
+Allow:\
+""")
+
+    def test_group_without_user_agent(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Disallow: /spam\
+""")
 
 
 @unittest.skipUnless(
@@ -495,7 +750,7 @@ def test_basic(self):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))