66 changes: 50 additions & 16 deletions Lib/test/test_robotparser.py
@@ -384,26 +384,23 @@ def test_string_formatting(self):
 )
 class BaseLocalNetworkTestCase:
 
-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)
 
-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        cls.enterClassContext(threading_helper.start_threads([t]))
+        cls.addClassCleanup(cls.server.shutdown)
 
 
 SAMPLE_ROBOTS_TXT = b'''\
@@ -425,7 +422,6 @@ def do_GET(self):
         def log_message(self, format, *args):
             pass
 
-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -447,24 +443,62 @@ def testRead(self):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)
 
         def log_message(self, format, *args):
             pass
 
-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
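
A note on the class-level setup above: `addClassCleanup` callbacks run in LIFO order, so the server is told to shut down first, the serving thread is then joined when the `threading_helper.start_threads` context exits, and `server_close` releases the socket last. Below is a minimal sketch of that lifecycle using only the standard library; `DemoHandler`, `DemoServerTestCase`, and the plain `t.join` cleanup (standing in for `threading_helper.start_threads`) are illustrative names, not part of the patch.

```python
import threading
import unittest
from http.server import BaseHTTPRequestHandler, HTTPServer


class DemoHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Serve whatever status code the current test asked for.
        self.send_error(self.server.return_code or 404)

    def log_message(self, format, *args):
        pass  # keep test output quiet


class DemoServerTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.server = HTTPServer(("127.0.0.1", 0), DemoHandler)
        cls.addClassCleanup(cls.server.server_close)   # runs last (LIFO)

        t = threading.Thread(target=cls.server.serve_forever,
                             kwargs={"poll_interval": 0.01}, daemon=True)
        t.start()
        cls.addClassCleanup(t.join)                    # then join the thread
        cls.addClassCleanup(cls.server.shutdown)       # runs first: stop serving

    def setUp(self):
        # Each test must pick its own status code explicitly.
        self.server.return_code = None

    def test_not_found(self):
        self.server.return_code = 404
        # ... exercise the server here ...


if __name__ == "__main__":
    unittest.main()
```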
10 changes: 9 additions & 1 deletion Lib/urllib/robotparser.py
@@ -77,9 +77,17 @@ def read(self):
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robots.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
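
To see the effect of the new branches, here is a rough, self-contained sketch (not part of the patch) that serves every request, including `/robots.txt`, with a fixed status code and then asks `RobotFileParser` whether fetching is allowed. The `probe` helper and `Handler` class are hypothetical; the commented results assume this patch is applied.

```python
import threading
import urllib.robotparser
from http.server import BaseHTTPRequestHandler, HTTPServer


def probe(status_code):
    """Return can_fetch('*', ...) when robots.txt answers with status_code."""
    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_error(status_code)

        def log_message(self, format, *args):
            pass

    server = HTTPServer(("127.0.0.1", 0), Handler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    try:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"http://127.0.0.1:{server.server_address[1]}/robots.txt")
        parser.read()
        return parser.can_fetch("*", "/any/path")
    finally:
        server.shutdown()
        server.server_close()


print(probe(403))  # False -- 401/403 set disallow_all
print(probe(404))  # True  -- other 4xx set allow_all (RFC 9309, 2.3.1.3)
print(probe(503))  # False -- 5xx set disallow_all (RFC 9309, 2.3.1.4)
```

The 401/403 and 5xx branches both set `disallow_all`, but keeping them separate lets each carry its own RFC 9309 reference.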
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.