diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index e33723cc70c877..ae6fe2ef2ce773 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -384,26 +384,23 @@ def test_string_formatting(self):
 )
 class BaseLocalNetworkTestCase:
 
-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)
 
-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        cls.enterClassContext(threading_helper.start_threads([t]))
+        cls.addClassCleanup(cls.server.shutdown)
 
 
 SAMPLE_ROBOTS_TXT = b'''\
@@ -425,7 +422,6 @@ def do_GET(self):
     def log_message(self, format, *args):
         pass
 
-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -447,17 +443,21 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)
 
         def log_message(self, format, *args):
             pass
 
-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
@@ -465,6 +465,40 @@ def testPasswordProtectedSite(self):
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 4009fd6b58f594..76297d028f07e6 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -77,9 +77,17 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robots.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
diff --git a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst b/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
new file mode 100644
index 00000000000000..bd9fff0bc2e31b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.
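For reference, the status-code handling added to `robotparser.py` can be exercised outside the test suite with a short stand-alone script. The sketch below is not part of the patch: the `ErrorHandler` class, host, and checked paths are made up for illustration, and it simply mirrors what `HttpErrorsTestCase` does — serve an HTTP error for every request, including `/robots.txt`, and observe `can_fetch()`. With the patch applied, 401/403 and 5xx responses make `can_fetch()` return ``False``, while other 4xx responses make it return ``True``.

```python
import threading
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer


class ErrorHandler(BaseHTTPRequestHandler):
    # Status code returned for every request; swap in 403, 404, 418, ...
    return_code = 503

    def do_GET(self):
        # Every path, including /robots.txt, fails with the chosen status.
        self.send_error(self.return_code)

    def log_message(self, format, *args):
        # Silence per-request logging for the demo.
        pass


server = HTTPServer(("127.0.0.1", 0), ErrorHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()

base = f"http://127.0.0.1:{server.server_address[1]}"
parser = urllib.robotparser.RobotFileParser(base + "/robots.txt")
parser.read()

# With the patched read(): 503 prints False (complete disallow),
# while a 404 would print True (allow all).
print(parser.can_fetch("*", base + "/path/file.html"))

server.shutdown()
server.server_close()
```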