12 changes: 12 additions & 0 deletions Lib/test/test_robotparser.py
@@ -299,6 +299,17 @@ def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class WeirdUnquotePathTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *

# This can be interpreted as a weird unquoted path, or as a URL with an
# invalid IPv6 host.
Disallow: //[foo]/bar
"""
    pass
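For context, a hedged sketch of the ambiguity the comment above points at: treated as a URL, the leading `//` makes the standard parser read `[foo]` as a bracketed (IPv6-style) host rather than as part of the path, and whether that merely misparses or raises depends on the Python version.

```python
import urllib.parse

try:
    parts = urllib.parse.urlparse('//[foo]/bar')
    # Older interpreters: netloc='[foo]', path='/bar'
    print(parts.netloc, parts.path)
except ValueError as exc:
    # Interpreters that validate bracketed hosts reject '[foo]' outright.
    print('rejected as a URL:', exc)
```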


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
@@ -388,5 +399,6 @@ def test_read_404(self):
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__=='__main__':
    unittest.main()
11 changes: 10 additions & 1 deletion Lib/urllib/robotparser.py
@@ -19,6 +19,15 @@
RequestRate = collections.namedtuple("RequestRate", "requests seconds")


def normalize_path(path):
    query, fragment = '', ''
    if '#' in path:
        path, fragment = path.split('#', 1)
    if '?' in path:
        path, query = path.split('?', 1)
    return urllib.parse.urlunsplit(('', '', path, query, fragment))
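A quick usage sketch, assuming the patched module so that `normalize_path` is importable: it only splits off the query and fragment and never attempts scheme/netloc parsing, so the odd `//[foo]/bar` value passes through without error.

```python
# Assumes the patched urllib.robotparser, where normalize_path() exists.
from urllib.robotparser import normalize_path

print(normalize_path('/a/b?x=1#frag'))  # '/a/b?x=1#frag'
# '[foo]' is never parsed as a bracketed host, so this cannot raise the
# ValueError that urlparse()-based normalization can on newer interpreters.
print(normalize_path('//[foo]/bar'))
```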


class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -219,7 +228,7 @@ def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        path = normalize_path(path)
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

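For illustration only, a sketch of how a rule built from such a line ends up stored after this change (`RuleLine` is an internal helper of `urllib.robotparser`, and this assumes the patched module):

```python
import urllib.robotparser

# With the patch, the path is only percent-quoted, not re-parsed as a URL.
rule = urllib.robotparser.RuleLine('//[foo]/bar', False)
print(rule.path)       # '//%5Bfoo%5D/bar'
print(rule.allowance)  # False
```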
@@ -0,0 +1 @@
Don't treat paths in ``robots.txt`` as URLs in :mod:`urllib.robotparser`.
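And an end-to-end sketch of the user-visible effect (again assuming the patched module): parsing a robots.txt that contains such a Disallow line should succeed, and ordinary paths remain fetchable.

```python
import urllib.robotparser

robots_txt = """\
User-agent: *
Disallow: //[foo]/bar
"""

parser = urllib.robotparser.RobotFileParser()
parser.parse(robots_txt.splitlines())           # should not raise on the odd path
print(parser.can_fetch('*', '/ordinary/page'))  # expected: True
```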