Skip to content

Commit e45fb5e

Browse files
Reduce the lifetime of response in the linkcheck builder (#11432)
Co-authored-by: Adam Turner <[email protected]>
1 parent ecc8613 commit e45fb5e

File tree

2 files changed

+101
-87
lines changed

2 files changed

+101
-87
lines changed

sphinx/builders/linkcheck.py

Lines changed: 96 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
from os import path
1414
from queue import PriorityQueue, Queue
1515
from threading import Thread
16-
from typing import Any, Generator, NamedTuple, Tuple, Union, cast
17-
from urllib.parse import unquote, urlparse, urlunparse
16+
from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
17+
from urllib.parse import unquote, urlparse, urlsplit, urlunparse
1818

1919
from docutils import nodes
2020
from requests import Response
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):
7272

7373

7474
class AnchorCheckParser(HTMLParser):
75-
"""Specialized HTML parser that looks for a specific anchor."""
75+
"""Specialised HTML parser that looks for a specific anchor."""
7676

7777
def __init__(self, search_anchor: str) -> None:
7878
super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
8787
break
8888

8989

90-
def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
91-
"""Reads HTML data from a response object `response` searching for `anchor`.
92-
Returns True if anchor was found, False otherwise.
93-
"""
94-
parser = AnchorCheckParser(anchor)
90+
def contains_anchor(response: Response, anchor: str) -> bool:
91+
"""Determine if an anchor is contained within an HTTP response."""
92+
93+
parser = AnchorCheckParser(unquote(anchor))
9594
# Read file in chunks. If we find a matching anchor, we break
9695
# the loop early in hopes not to have to download the whole thing.
9796
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
@@ -271,7 +270,7 @@ def run(self) -> None:
271270
kwargs['timeout'] = self.config.linkcheck_timeout
272271

273272
def get_request_headers() -> dict[str, str]:
274-
url = urlparse(uri)
273+
url = urlsplit(uri)
275274
candidates = [f"{url.scheme}://{url.netloc}",
276275
f"{url.scheme}://{url.netloc}/",
277276
uri,
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
286285
return {}
287286

288287
def check_uri() -> tuple[str, str, int]:
289-
# split off anchor
290-
if '#' in uri:
291-
req_url, anchor = uri.split('#', 1)
292-
for rex in self.anchors_ignore:
293-
if rex.match(anchor):
294-
anchor = None
295-
break
296-
else:
297-
req_url = uri
298-
anchor = None
288+
req_url, delimiter, anchor = uri.partition('#')
289+
for rex in self.anchors_ignore if delimiter and anchor else []:
290+
if rex.match(anchor):
291+
anchor = ''
292+
break
299293

300294
# handle non-ASCII URIs
301295
try:
@@ -313,71 +307,83 @@ def check_uri() -> tuple[str, str, int]:
313307
# update request headers for the URL
314308
kwargs['headers'] = get_request_headers()
315309

316-
try:
317-
if anchor and self.config.linkcheck_anchors:
318-
# Read the whole document and see if #anchor exists
319-
with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
320-
**kwargs) as response:
321-
response.raise_for_status()
322-
found = check_anchor(response, unquote(anchor))
323-
324-
if not found:
325-
raise Exception(__("Anchor '%s' not found") % anchor)
326-
else:
327-
try:
328-
# try a HEAD request first, which should be easier on
329-
# the server and the network
330-
with requests.head(req_url, allow_redirects=True, config=self.config,
331-
auth=auth_info, **kwargs) as response:
332-
response.raise_for_status()
310+
# Linkcheck HTTP request logic:
311+
#
312+
# - Attempt HTTP HEAD before HTTP GET unless page content is required.
313+
# - Follow server-issued HTTP redirects.
314+
# - Respect server-issued HTTP 429 back-offs.
315+
error_message = None
316+
status_code = -1
317+
response_url = retry_after = ''
318+
for retrieval_method, retrieval_kwargs in _retrieval_methods(
319+
self.config.linkcheck_anchors, anchor,
320+
):
321+
try:
322+
with retrieval_method(url=req_url, auth=auth_info, config=self.config,
323+
**retrieval_kwargs, **kwargs) as response:
324+
if response.ok and anchor and not contains_anchor(response, anchor):
325+
raise Exception(__(f'Anchor {anchor!r} not found'))
326+
327+
# Copy data we need from the (closed) response
328+
status_code = response.status_code
329+
redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501
330+
retry_after = response.headers.get('Retry-After')
331+
response_url = f'{response.url}'
332+
response.raise_for_status()
333+
del response
334+
break
335+
336+
except (ConnectionError, TooManyRedirects) as err:
333337
# Servers drop the connection on HEAD requests, causing
334338
# ConnectionError.
335-
except (ConnectionError, HTTPError, TooManyRedirects) as err:
336-
if isinstance(err, HTTPError) and err.response.status_code == 429:
337-
raise
338-
# retry with GET request if that fails, some servers
339-
# don't like HEAD requests.
340-
with requests.get(req_url, stream=True, config=self.config,
341-
auth=auth_info, **kwargs) as response:
342-
response.raise_for_status()
343-
except HTTPError as err:
344-
if err.response.status_code == 401:
345-
# We'll take "Unauthorized" as working.
346-
return 'working', ' - unauthorized', 0
347-
elif err.response.status_code == 429:
348-
next_check = self.limit_rate(err.response)
349-
if next_check is not None:
350-
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
351-
return 'rate-limited', '', 0
352-
return 'broken', str(err), 0
353-
elif err.response.status_code == 503:
354-
# We'll take "Service Unavailable" as ignored.
355-
return 'ignored', str(err), 0
356-
else:
339+
error_message = str(err)
340+
continue
341+
342+
except HTTPError as err:
343+
error_message = str(err)
344+
345+
# Unauthorised: the reference probably exists
346+
if status_code == 401:
347+
return 'working', 'unauthorized', 0
348+
349+
# Rate limiting; back-off if allowed, or report failure otherwise
350+
if status_code == 429:
351+
if next_check := self.limit_rate(response_url, retry_after):
352+
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
353+
return 'rate-limited', '', 0
354+
return 'broken', error_message, 0
355+
356+
# Don't claim success/failure during server-side outages
357+
if status_code == 503:
358+
return 'ignored', 'service unavailable', 0
359+
360+
# For most HTTP failures, continue attempting alternate retrieval methods
361+
continue
362+
363+
except Exception as err:
364+
# Unhandled exception (intermittent or permanent); report that
365+
# the link is broken.
357366
return 'broken', str(err), 0
358-
except Exception as err:
359-
return 'broken', str(err), 0
367+
360368
else:
361-
netloc = urlparse(req_url).netloc
362-
try:
363-
del self.rate_limits[netloc]
364-
except KeyError:
365-
pass
366-
if response.url.rstrip('/') == req_url.rstrip('/'):
369+
# All available retrieval methods have been exhausted; report
370+
# that the link is broken.
371+
return 'broken', error_message, 0
372+
373+
# Success; clear rate limits for the origin
374+
netloc = urlsplit(req_url).netloc
375+
try:
376+
del self.rate_limits[netloc]
377+
except KeyError:
378+
pass
379+
380+
if ((response_url.rstrip('/') == req_url.rstrip('/'))
381+
or allowed_redirect(req_url, response_url)):
367382
return 'working', '', 0
383+
elif redirect_status_code is not None:
384+
return 'redirected', response_url, redirect_status_code
368385
else:
369-
new_url = response.url
370-
if anchor:
371-
new_url += '#' + anchor
372-
373-
if allowed_redirect(req_url, new_url):
374-
return 'working', '', 0
375-
elif response.history:
376-
# history contains any redirects, get last
377-
code = response.history[-1].status_code
378-
return 'redirected', new_url, code
379-
else:
380-
return 'redirected', new_url, 0
386+
return 'redirected', response_url, 0
381387

382388
def allowed_redirect(url: str, new_url: str) -> bool:
383389
return any(
@@ -428,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]:
428434

429435
if uri is None:
430436
break
431-
netloc = urlparse(uri).netloc
437+
netloc = urlsplit(uri).netloc
432438
try:
433439
# Refresh rate limit.
434440
# When there are many links in the queue, workers are all stuck waiting
@@ -451,9 +457,8 @@ def check(docname: str) -> tuple[str, str, int]:
451457
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
452458
self.wqueue.task_done()
453459

454-
def limit_rate(self, response: Response) -> float | None:
460+
def limit_rate(self, response_url: str, retry_after: str) -> float | None:
455461
next_check = None
456-
retry_after = response.headers.get("Retry-After")
457462
if retry_after:
458463
try:
459464
# Integer: time to wait before next attempt.
@@ -471,7 +476,7 @@ def limit_rate(self, response: Response) -> float | None:
471476
delay = (until - datetime.now(timezone.utc)).total_seconds()
472477
else:
473478
next_check = time.time() + delay
474-
netloc = urlparse(response.url).netloc
479+
netloc = urlsplit(response_url).netloc
475480
if next_check is None:
476481
max_delay = self.config.linkcheck_rate_limit_timeout
477482
try:
@@ -490,6 +495,15 @@ def limit_rate(self, response: Response) -> float | None:
490495
return next_check
491496

492497

498+
def _retrieval_methods(
499+
linkcheck_anchors: bool,
500+
anchor: str,
501+
) -> Iterator[tuple[Callable, dict[str, bool]]]:
502+
if not linkcheck_anchors or not anchor:
503+
yield requests.head, {'allow_redirects': True}
504+
yield requests.get, {'stream': True}
505+
506+
493507
class HyperlinkCollector(SphinxPostTransform):
494508
builders = ('linkcheck',)
495509
default_priority = 800

tests/test_build_linkcheck.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -637,14 +637,14 @@ class FakeResponse:
637637
def test_limit_rate_default_sleep(app):
638638
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
639639
with mock.patch('time.time', return_value=0.0):
640-
next_check = worker.limit_rate(FakeResponse())
640+
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
641641
assert next_check == 60.0
642642

643643

644644
def test_limit_rate_user_max_delay(app):
645645
app.config.linkcheck_rate_limit_timeout = 0.0
646646
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
647-
next_check = worker.limit_rate(FakeResponse())
647+
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
648648
assert next_check is None
649649

650650

@@ -653,7 +653,7 @@ def test_limit_rate_doubles_previous_wait_time(app):
653653
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
654654
rate_limits)
655655
with mock.patch('time.time', return_value=0.0):
656-
next_check = worker.limit_rate(FakeResponse())
656+
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
657657
assert next_check == 120.0
658658

659659

@@ -663,7 +663,7 @@ def test_limit_rate_clips_wait_time_to_max_time(app):
663663
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
664664
rate_limits)
665665
with mock.patch('time.time', return_value=0.0):
666-
next_check = worker.limit_rate(FakeResponse())
666+
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
667667
assert next_check == 90.0
668668

669669

@@ -672,7 +672,7 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
672672
rate_limits = {"localhost": RateLimit(90.0, 0.0)}
673673
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
674674
rate_limits)
675-
next_check = worker.limit_rate(FakeResponse())
675+
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
676676
assert next_check is None
677677

678678

0 commit comments

Comments
 (0)