
Commit db992ca

gh-138907: Support RFC 9309 in robotparser
1 parent e0f54a6 commit db992ca
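
RFC 9309 standardizes the Robots Exclusion Protocol: rules are compared by longest (most specific) match rather than file order, `*` and `$` act as wildcard and end-of-URL markers, an Allow beats an equally specific Disallow, and groups naming the same agent are merged. A rough sketch of the behaviour the new tests pin down, using the RFC 9309 section 5.2 example that LongestMatchTest below is built on ('examplebot' is an arbitrary agent name, not part of the commit):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: *
Allow: /example/page/
Disallow: /example/page/disallowed.gif
Allow: /example/
""".splitlines())

# The longest matching rule decides, regardless of the order of the lines.
parser.can_fetch('examplebot', '/example/page/')                # True
parser.can_fetch('examplebot', '/example/page/disallowed.gif')  # False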

2 files changed: +428 -102 lines changed

Lib/test/test_robotparser.py

Lines changed: 288 additions & 33 deletions
@@ -15,14 +15,18 @@ class BaseRobotTest:
     good = []
     bad = []
     site_maps = None
+    expected_output = None
 
     def __init_subclass__(cls):
         super().__init_subclass__()
         # Remove tests that do nothing.
-        if not cls.good:
-            cls.test_good_urls = None
-        if not cls.bad:
-            cls.test_bad_urls = None
+        if issubclass(cls, unittest.TestCase):
+            if not cls.good:
+                cls.test_good_urls = None
+            if not cls.bad:
+                cls.test_bad_urls = None
+            if cls.expected_output is None:
+                cls.test_string_formatting = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
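
The issubclass() guard above limits the pruning to concrete TestCase subclasses, so a helper mixin that leaves good/bad empty (such as BaseRequestRateTest later in the file) no longer nulls out methods its children still need, and the new expected_output hook drops test_string_formatting for classes that do not opt in. A condensed sketch of the pattern, with illustrative class names that are not from the commit:

import unittest

class _Base:
    good = []
    expected_output = None

    def __init_subclass__(cls):
        super().__init_subclass__()
        if issubclass(cls, unittest.TestCase):
            # unittest's loader only collects callable test_* attributes,
            # so setting one to None effectively removes that test.
            if not cls.good:
                cls.test_good_urls = None
            if cls.expected_output is None:
                cls.test_string_formatting = None

    def test_good_urls(self):
        ...

    def test_string_formatting(self):
        ...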
@@ -50,6 +54,8 @@ def test_bad_urls(self):
     def test_site_maps(self):
         self.assertEqual(self.parser.site_maps(), self.site_maps)
 
+    def test_string_formatting(self):
+        self.assertEqual(str(self.parser), self.expected_output)
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -61,6 +67,56 @@ class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     good = ['/', '/test.html']
     bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
 
+class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
+    # Example from RFC 9309, section 5.1.
+    robots_txt = """\
+User-Agent: *
+Disallow: *.gif$
+Disallow: /example/
+Allow: /publications/
+
+User-Agent: foobot
+Disallow:/
+Allow:/example/page.html
+Allow:/example/allowed.gif
+
+User-Agent: barbot
+User-Agent: bazbot
+Disallow: /example/page.html
+
+User-Agent: quxbot
+"""
+    good = [
+        '/', '/publications/',
+        ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
+        ('barbot', '/'), ('barbot', '/example/'),
+        ('barbot', '/example/allowed.gif'),
+        ('barbot', '/example/disallowed.gif'),
+        ('barbot', '/publications/'),
+        ('barbot', '/publications/allowed.gif'),
+        ('bazbot', '/'), ('bazbot', '/example/'),
+        ('bazbot', '/example/allowed.gif'),
+        ('bazbot', '/example/disallowed.gif'),
+        ('bazbot', '/publications/'),
+        ('bazbot', '/publications/allowed.gif'),
+        ('quxbot', '/'), ('quxbot', '/example/'),
+        ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
+        ('quxbot', '/example/disallowed.gif'),
+        ('quxbot', '/publications/'),
+        ('quxbot', '/publications/allowed.gif'),
+    ]
+    bad = [
+        '/example/', '/example/page.html', '/example/allowed.gif',
+        '/example/disallowed.gif',
+        '/publications/allowed.gif',
+        ('foobot', '/'), ('foobot', '/example/'),
+        ('foobot', '/example/disallowed.gif'),
+        ('foobot', '/publications/'),
+        ('foobot', '/publications/allowed.gif'),
+        ('barbot', '/example/page.html'),
+        ('bazbot', '/example/page.html'),
+    ]
+
 
 class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
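
The good/bad entries above translate directly into can_fetch() calls. A minimal sketch that feeds only the foobot group of the RFC 9309 section 5.1 example to the parser; the expected results mirror the foobot assertions in SimpleExampleTest:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-Agent: foobot
Disallow:/
Allow:/example/page.html
Allow:/example/allowed.gif
""".splitlines())

# The explicit Allow rules are longer matches than the blanket "Disallow:/".
parser.can_fetch('foobot', '/example/page.html')       # True
parser.can_fetch('foobot', '/example/allowed.gif')     # True
parser.can_fetch('foobot', '/example/disallowed.gif')  # False
parser.can_fetch('foobot', '/')                        # False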
@@ -137,6 +193,7 @@ def test_request_rate(self):
 class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = ''
     good = ['/foo']
+    expected_output = ''
 
 
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
@@ -221,17 +278,185 @@ class UserAgentGoogleMobileTest(UserAgentOrderingTest):
     agent = 'Googlebot-Mobile'
 
 
-class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
-    # Google also got the order wrong. You need
-    # to specify the URLs from more specific to more general
+class LongestMatchTest(BaseRobotTest, unittest.TestCase):
+    # Based on example from RFC 9309, section 5.2.
     robots_txt = """\
-User-agent: Googlebot
-Allow: /folder1/myfile.html
-Disallow: /folder1/
+User-agent: *
+Allow: /example/page/
+Disallow: /example/page/disallowed.gif
+Allow: /example/
 """
-    agent = 'googlebot'
-    good = ['/folder1/myfile.html']
-    bad = ['/folder1/anotherfile.html']
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif']
+
+
+class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Allow: /example/page/
+Disallow: *.gif
+Allow: /example/
+"""
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif', '/x.gif']
+
+
+class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam
+Disallow: /spam
+"""
+    good = ['/spam', '/spam/']
+
+
+class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam$
+Disallow: /spam
+Disallow: /eggs$
+Allow: /eggs
+Disallow: /eggs$
+"""
+    good = ['/spam', '/eggs', '/eggs/']
+    bad = ['/spam/']
+
+
+class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: *am
+Disallow: /spam
+Disallow: *gs
+Allow: /eggs
+Disallow: *gs
+"""
+    good = ['/spam', '/eggs', '/spam/', '/eggs/']
+
+
+class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: spambot
+Disallow: /another/path
+"""
+    agent = 'spambot'
+    bad = ['/some/path', '/another/path']
+
+
+class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+User-agent: eggsbot
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('spambot', '/another/path'),
+            ('eggsbot', '/'), ('eggsbot', '/some/path')]
+    bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
+    expected_output = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: eggsbot
+Disallow: /another/path\
+"""
+
+class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+
+User-agent: eggsbot
+Disallow: /some/path
+
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('eggsbot', '/')]
+    bad = [
+        ('spambot', '/some/path'), ('spambot', '/another/path'),
+        ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
+    ]
+    expected_output = """\
+User-agent: spambot
+User-agent: eggsbot
+Disallow: /another/path
+Disallow: /some/path\
+"""
+
+
+class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+Disallow: /some/path
+
+User-agent: *
+Disallow: /another/path
+"""
+    good = ['/', '/some/path']
+    bad = ['/another/path']
+    expected_output = """\
+User-agent: *
+Disallow: /another/path\
+"""
+
+
+class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+"""
+    agent = 'spambot'
+    good = ['/', '/some/path']
+    expected_output = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+Allow:\
+"""
+
+
+class WeirdPathTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = f"""\
+User-agent: *
+Disallow: /a$$$
+Disallow: /b$z
+Disallow: /c***
+Disallow: /d***z
+Disallow: /e*$**$$
+Disallow: /f*$**$$z
+Disallow: /g$*$$**
+Disallow: /h$*$$**z
+"""
+    good = ['/b', '/bz', '/ax', '/d', '/f', '/fz', '/gx', '/h', '/hz']
+    bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy', '/g']
+    expected_output = """\
+User-agent: *
+Disallow: /c*
+Disallow: /d*z
+Disallow: /e*$
+Disallow: /a$
+Disallow: /g$\
+"""
+
+
+class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
+    # This test would take many years if use naive translation to regular
+    # expression (* -> .*).
+    N = 50
+    robots_txt = f"""\
+User-agent: *
+Disallow: /{'*a'*N}*b
+"""
+    good = ['/' + 'a'*N + 'a']
+    bad = ['/' + 'a'*N + 'b']
 
 
 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
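
Two of the RFC 9309 details these classes pin down: when an Allow and a Disallow rule match a URL with equal specificity the Allow rule wins (AllowWinsEqualMatchTest), and wildcard rules take part in the longest-match comparison, so a short pattern such as *.gif can still be the deciding rule (LongestMatchWildcardTest). A sketch reproducing those expectations with an arbitrary agent name:

import urllib.robotparser

# Equally specific Allow and Disallow for the same path: Allow wins.
parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: *
Disallow: /spam
Allow: /spam
""".splitlines())
parser.can_fetch('somebot', '/spam')   # True

# Wildcard rules compete with plain prefixes in the longest-match comparison.
parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: *
Allow: /example/page/
Disallow: *.gif
Allow: /example/
""".splitlines())
parser.can_fetch('somebot', '/example/page/')                # True
parser.can_fetch('somebot', '/example/page/disallowed.gif')  # False

PathWithManyWildcardsTest additionally guards against the exponential backtracking that a naive "* -> .*" regex translation would show on a rule like Disallow: /*a*a...*a*b, which, per its comment, would otherwise make the test run for years; the test asserts only the matching results, not how the implementation avoids the blow-up.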
@@ -251,19 +476,6 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
            '/yet/one/path?name=value&more']
 
 
-class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
-    # obey first * entry (#4108)
-    robots_txt = """\
-User-agent: *
-Disallow: /some/path
-
-User-agent: *
-Disallow: /another/path
-"""
-    good = ['/another/path']
-    bad = ['/some/path']
-
-
 class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
@@ -365,17 +577,60 @@ class StringFormattingTest(BaseRobotTest, unittest.TestCase):
 """
 
     expected_output = """\
-User-agent: cybermapper
-Disallow: /some/path
-
 User-agent: *
 Crawl-delay: 1
 Request-rate: 3/15
-Disallow: /cyberworld/map/\
+Disallow: /cyberworld/map/
+
+User-agent: cybermapper
+Disallow: /some/path\
 """
 
-    def test_string_formatting(self):
-        self.assertEqual(str(self.parser), self.expected_output)
+
+class ConstructedStringFormattingTest(unittest.TestCase):
+    def test_empty(self):
+        parser = urllib.robotparser.RobotFileParser()
+        self.assertEqual(str(parser), '')
+
+    def test_group_without_rules(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['hambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['eggsbot']
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Allow:
+
+User-agent: hambot
+Disallow: /ham
+
+User-agent: eggsbot
+Allow:\
+""")
+
+    def test_group_without_user_agent(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Disallow: /spam\
+""")
 
 
 @unittest.skipUnless(
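
str() on a RobotFileParser renders the parsed groups back out as robots.txt text, which is what the new expected_output attributes assert; Entry and RuleLine, used directly above, are the parser's internal group and rule containers. A small sketch using the same input as UserAgentStartsGroupTest earlier in the diff:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path
""".splitlines())

print(parser)
# User-agent: spambot
# Disallow: /some/path
#
# User-agent: eggsbot
# Disallow: /another/path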
@@ -495,7 +750,7 @@ def test_basic(self):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))
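
The network test above exercises the same API against a live robots.txt fetched with read(). A hedged usage sketch; the URL is a placeholder rather than the one the test uses, and the outcome depends entirely on what that file contains:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.set_url('https://www.example.org/robots.txt')  # placeholder URL
parser.read()  # downloads and parses the file over the network

if parser.can_fetch('Nutch', 'https://www.example.org/brian/'):
    print('allowed to crawl')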
