
Commit cb7ef18

gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502)
* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones.
* Remove tests that do nothing.
1 parent ed522ed commit cb7ef18
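
As a quick illustration of the behaviour change described above — a minimal sketch, assuming an interpreter that includes this commit (older versions drop the trailing "?" and decode %3F/%3D before matching, so results differ there); the agent name "mybot" is arbitrary, and the rules and expected results mirror DisallowQueryStringTest below:

    import io
    import urllib.robotparser

    robots_txt = """\
    User-agent: *
    Disallow: /some/path?name=value
    Disallow: /another/path?
    """

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(io.StringIO(robots_txt).readlines())

    agent = "mybot"
    # The trailing "?" is no longer ignored: the bare path stays allowed,
    # while the path with an empty query is blocked.
    print(parser.can_fetch(agent, "/another/path"))            # True
    print(parser.can_fetch(agent, "/another/path?"))           # False
    # Raw "?" and "=" are now distinct from their percent-encoded forms.
    print(parser.can_fetch(agent, "/some/path?name=value"))    # False
    print(parser.can_fetch(agent, "/some/path%3Fname=value"))  # True
    print(parser.can_fetch(agent, "/some/path?name%3Dvalue"))  # True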

File tree: 4 files changed, +172 -31 lines


Lib/test/test_robotparser.py

Lines changed: 141 additions & 20 deletions
@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
 """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value',
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']
 
 
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded non-ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/#[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address
Lib/urllib/robotparser.py

Lines changed: 24 additions & 11 deletions
@@ -11,6 +11,7 @@
 """
 
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
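
For reference, a standalone sketch of the round-trip these two helpers perform; the helpers are copied here verbatim so the snippet runs on its own, and the expected values mirror cases from PercentEncodingTest in the test diff above:

    import re
    import urllib.parse

    def normalize(path):
        unquoted = urllib.parse.unquote(path, errors='surrogateescape')
        return urllib.parse.quote(unquoted, errors='surrogateescape')

    def normalize_path(path):
        path, sep, query = path.partition('?')
        path = normalize(path)
        if sep:
            query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
            path += '?' + query
        return path

    assert normalize_path('/john doe') == '/john%20doe'              # raw space is encoded
    assert normalize_path('/p%xy') == '/p%25xy'                      # stray "%" is escaped, not an error
    assert normalize_path('/a2/%5A%2D%2E%5F%7E') == '/a2/Z-._~'      # unreserved characters are decoded
    assert normalize_path('/query/eq?q%3Dv') == '/query/eq?q%3Dv'    # "%3D" stays distinct from "="
    assert normalize_path('/query/eq?q=v') == '/query/eq?q=v'        # raw "=" is preserved
    assert normalize_path('/another/path?') == '/another/path?'      # trailing "?" is kept
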
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,11 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        # TODO: The private API is used in order to preserve an empty query.
+        # This is temporary until the public API starts supporting this feature.
+        parsed_url = urllib.parse._urlsplit(url, '')
+        url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
@@ -213,16 +228,14 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
     def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +281,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
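
The decode("utf-8", "surrogateescape") change in read() pairs with the surrogateescape round-trip in the normalization helpers: bytes that are not valid UTF-8 survive decoding and end up matched as percent-encoded octets instead of raising an error. A minimal sketch, assuming an interpreter with this commit (parse() is fed pre-decoded lines here the same way read() now produces them; "mybot" is an arbitrary agent name):

    import urllib.robotparser

    raw = b"User-agent: *\nDisallow: /non-utf8/\xf0\n"
    lines = raw.decode("utf-8", "surrogateescape").splitlines()

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    # The lone 0xF0 byte is preserved as a surrogate and quoted back to %F0.
    print(parser.can_fetch("mybot", "/non-utf8/%F0"))    # False (blocked)
    print(parser.can_fetch("mybot", "/non-utf8/other"))  # True
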
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.
