diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 8d89e2a8224452..15b3cf0e9fa473 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -299,6 +299,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
+class WeirdUnquotePathTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+
+# This can be interpreted as weird unquoted path, or an URL with invalid IPv6
+# host as well.
+Disallow: //[foo]/bar
+    """
+    pass
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -388,5 +399,6 @@ def test_read_404(self):
         self.assertIsNone(parser.crawl_delay('*'))
         self.assertIsNone(parser.request_rate('*'))
 
+
 if __name__=='__main__':
     unittest.main()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index c58565e3945146..b51a82928b6988 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -19,6 +19,15 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize_path(path):
+    query, fragment = '', ''
+    if '#' in path:
+        path, fragment = path.split('#', 1)
+    if '?' in path:
+        path, query = path.split('?', 1)
+    return urllib.parse.urlunsplit(('', '', path, query, fragment))
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
@@ -219,7 +228,7 @@ def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
+        path = normalize_path(path)
         self.path = urllib.parse.quote(path)
         self.allowance = allowance
 
diff --git a/Misc/NEWS.d/next/Library/2023-12-17-22-38-10.gh-issue-111788.twWfD-.rst b/Misc/NEWS.d/next/Library/2023-12-17-22-38-10.gh-issue-111788.twWfD-.rst
new file mode 100644
index 00000000000000..9f71524d36fdd8
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-12-17-22-38-10.gh-issue-111788.twWfD-.rst
@@ -0,0 +1 @@
+Don't treat path in robots.txt as URL in :func:`urllib.robotparser`.