Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 95 additions & 76 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
break


def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
"""Reads HTML data from a response object `response` searching for `anchor`.
Returns True if anchor was found, False otherwise.
def contains_anchor(response: Response, anchor: str) -> bool:
"""Determines whether an anchor is contained within an HTTP response
"""
parser = AnchorCheckParser(anchor)
parser = AnchorCheckParser(unquote(anchor))
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
Expand Down Expand Up @@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
return {}

def check_uri() -> tuple[str, str, int]:
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else:
req_url = uri
anchor = None
req_url, delimiter, anchor = uri.partition('#')
for rex in self.anchors_ignore if delimiter and anchor else []:
if rex.match(anchor):
anchor = None
break

# handle non-ASCII URIs
try:
Expand All @@ -313,71 +307,97 @@ def check_uri() -> tuple[str, str, int]:
# update request headers for the URL
kwargs['headers'] = get_request_headers()

try:
if anchor and self.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
**kwargs) as response:
response.raise_for_status()
found = check_anchor(response, unquote(anchor))

if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
else:
try:
# try a HEAD request first, which should be easier on
# the server and the network
with requests.head(req_url, allow_redirects=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
def retrieval_methods():
if not anchor or not self.config.linkcheck_anchors:
yield lambda: requests.head(
url=req_url,
auth=auth_info,
config=self.config,
allow_redirects=True,
**kwargs,
)
yield lambda: requests.get(
url=req_url,
auth=auth_info,
config=self.config,
stream=True,
**kwargs,
)

# Linkcheck HTTP request logic:
#
# - Attempt HTTP HEAD before HTTP GET unless page content is required.
# - Follow server-issued HTTP redirects.
# - Respect server-issued HTTP 429 backoffs.
error_message = None
for retrieval_attempt in retrieval_methods():
try:
with retrieval_attempt() as response:
if response.ok and anchor and not contains_anchor(response, anchor):
raise Exception(__(f'Anchor {anchor!r} not found'))

# Copy data we need from the (closed) response
status_code, redirect_status_code, retry_after, res_url = (
response.status_code,
response.history[-1].status_code if response.history else None,
response.headers.get('Retry-After'),
response.url,
)
response.raise_for_status()
break

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
# ConnectionError.
except (ConnectionError, HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
with requests.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
error_message = str(err)
continue

except HTTPError as err:
error_message = str(err)

# Unauthorized: the reference probably exists
if status_code == 401:
return 'working', 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(res_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that the
# link is broken.
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0

else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0

# Success; clear rate limits for the origin
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass

if res_url.rstrip('/') == req_url.rstrip('/'):
return 'working', '', 0
elif allowed_redirect(req_url, res_url):
return 'working', '', 0
elif redirect_status_code is not None:
return 'redirected', res_url, redirect_status_code
else:
new_url = response.url
if anchor:
new_url += '#' + anchor

if allowed_redirect(req_url, new_url):
return 'working', '', 0
elif response.history:
# history contains any redirects, get last
code = response.history[-1].status_code
return 'redirected', new_url, code
else:
return 'redirected', new_url, 0
return 'redirected', res_url, 0

def allowed_redirect(url: str, new_url: str) -> bool:
return any(
Expand Down Expand Up @@ -451,9 +471,8 @@ def check(docname: str) -> tuple[str, str, int]:
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response: Response) -> float | None:
def limit_rate(self, res_url: str, retry_after: str) -> float | None:
next_check = None
retry_after = response.headers.get("Retry-After")
if retry_after:
try:
# Integer: time to wait before next attempt.
Expand All @@ -471,7 +490,7 @@ def limit_rate(self, response: Response) -> float | None:
delay = (until - datetime.now(timezone.utc)).total_seconds()
else:
next_check = time.time() + delay
netloc = urlparse(response.url).netloc
netloc = urlparse(res_url).netloc
if next_check is None:
max_delay = self.config.linkcheck_rate_limit_timeout
try:
Expand Down
Empty file added tests/test-server.lock
Empty file.
10 changes: 5 additions & 5 deletions tests/test_build_linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,14 +637,14 @@ class FakeResponse:
def test_limit_rate_default_sleep(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 60.0


def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check is None


Expand All @@ -653,7 +653,7 @@ def test_limit_rate_doubles_previous_wait_time(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 120.0


Expand All @@ -663,7 +663,7 @@ def test_limit_rate_clips_wait_time_to_max_time(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 90.0


Expand All @@ -672,7 +672,7 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
rate_limits = {"localhost": RateLimit(90.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check is None


Expand Down