From b861e9e37465d0320bae54db9af3dbe5c7dc6a8f Mon Sep 17 00:00:00 2001 From: James Addison Date: Sat, 20 May 2023 18:32:32 +0100 Subject: [PATCH 1/3] Refactor: linkcheck builder: reduce the lifetime of the 'response' variable --- sphinx/builders/linkcheck.py | 171 +++++++++++++++++++--------------- tests/test-server.lock | 0 tests/test_build_linkcheck.py | 10 +- 3 files changed, 100 insertions(+), 81 deletions(-) create mode 100644 tests/test-server.lock diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 6c99f96e685..4df0d276b98 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None: break -def check_anchor(response: requests.requests.Response, anchor: str) -> bool: - """Reads HTML data from a response object `response` searching for `anchor`. - Returns True if anchor was found, False otherwise. +def contains_anchor(response: Response, anchor: str) -> bool: + """Determines whether an anchor is contained within an HTTP response """ - parser = AnchorCheckParser(anchor) + parser = AnchorCheckParser(unquote(anchor)) # Read file in chunks. If we find a matching anchor, we break # the loop early in hopes not to have to download the whole thing. for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): @@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]: return {} def check_uri() -> tuple[str, str, int]: - # split off anchor - if '#' in uri: - req_url, anchor = uri.split('#', 1) - for rex in self.anchors_ignore: - if rex.match(anchor): - anchor = None - break - else: - req_url = uri - anchor = None + req_url, delimiter, anchor = uri.partition('#') + for rex in self.anchors_ignore if delimiter and anchor else []: + if rex.match(anchor): + anchor = None + break # handle non-ASCII URIs try: @@ -313,71 +307,97 @@ def check_uri() -> tuple[str, str, int]: # update request headers for the URL kwargs['headers'] = get_request_headers() - try: - if anchor and self.config.linkcheck_anchors: - # Read the whole document and see if #anchor exists - with requests.get(req_url, stream=True, config=self.config, auth=auth_info, - **kwargs) as response: - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) - - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) - else: - try: - # try a HEAD request first, which should be easier on - # the server and the network - with requests.head(req_url, allow_redirects=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() + def retrieval_methods(): + if not anchor or not self.config.linkcheck_anchors: + yield lambda: requests.head( + url=req_url, + auth=auth_info, + config=self.config, + allow_redirects=True, + **kwargs, + ) + yield lambda: requests.get( + url=req_url, + auth=auth_info, + config=self.config, + stream=True, + **kwargs, + ) + + # Linkcheck HTTP request logic: + # + # - Attempt HTTP HEAD before HTTP GET unless page content is required. + # - Follow server-issued HTTP redirects. + # - Respect server-issued HTTP 429 backoffs. 
+ error_message = None + for retrieval_attempt in retrieval_methods(): + try: + with retrieval_attempt() as response: + if response.ok and anchor and not contains_anchor(response, anchor): + raise Exception(__(f'Anchor {anchor!r} not found')) + + # Copy data we need from the (closed) response + status_code, redirect_status_code, retry_after, res_url = ( + response.status_code, + response.history[-1].status_code if response.history else None, + response.headers.get('Retry-After'), + response.url, + ) + response.raise_for_status() + break + + except (ConnectionError, TooManyRedirects) as err: # Servers drop the connection on HEAD requests, causing # ConnectionError. - except (ConnectionError, HTTPError, TooManyRedirects) as err: - if isinstance(err, HTTPError) and err.response.status_code == 429: - raise - # retry with GET request if that fails, some servers - # don't like HEAD requests. - with requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() - except HTTPError as err: - if err.response.status_code == 401: - # We'll take "Unauthorized" as working. - return 'working', ' - unauthorized', 0 - elif err.response.status_code == 429: - next_check = self.limit_rate(err.response) - if next_check is not None: - self.wqueue.put(CheckRequest(next_check, hyperlink), False) - return 'rate-limited', '', 0 - return 'broken', str(err), 0 - elif err.response.status_code == 503: - # We'll take "Service Unavailable" as ignored. - return 'ignored', str(err), 0 - else: + error_message = str(err) + continue + + except HTTPError as err: + error_message = str(err) + + # Unauthorized: the reference probably exists + if status_code == 401: + return 'working', 'unauthorized', 0 + + # Rate limiting; back-off if allowed, or report failure otherwise + if status_code == 429: + if next_check := self.limit_rate(res_url, retry_after): + self.wqueue.put(CheckRequest(next_check, hyperlink), False) + return 'rate-limited', '', 0 + return 'broken', error_message, 0 + + # Don't claim success/failure during server-side outages + if status_code == 503: + return 'ignored', 'service unavailable', 0 + + # For most HTTP failures, continue attempting alternate retrieval methods + continue + + except Exception as err: + # Unhandled exception (intermittent or permanent); report that the + # the link is broken. return 'broken', str(err), 0 - except Exception as err: - return 'broken', str(err), 0 + else: - netloc = urlparse(req_url).netloc - try: - del self.rate_limits[netloc] - except KeyError: - pass - if response.url.rstrip('/') == req_url.rstrip('/'): + # All available retrieval methods have been exhausted; report + # that the link is broken. 
+ return 'broken', error_message, 0 + + # Success; clear rate limits for the origin + netloc = urlparse(req_url).netloc + try: + del self.rate_limits[netloc] + except KeyError: + pass + + if res_url.rstrip('/') == req_url.rstrip('/'): + return 'working', '', 0 + elif allowed_redirect(req_url, res_url): return 'working', '', 0 + elif redirect_status_code is not None: + return 'redirected', res_url, redirect_status_code else: - new_url = response.url - if anchor: - new_url += '#' + anchor - - if allowed_redirect(req_url, new_url): - return 'working', '', 0 - elif response.history: - # history contains any redirects, get last - code = response.history[-1].status_code - return 'redirected', new_url, code - else: - return 'redirected', new_url, 0 + return 'redirected', res_url, 0 def allowed_redirect(url: str, new_url: str) -> bool: return any( @@ -451,9 +471,8 @@ def check(docname: str) -> tuple[str, str, int]: self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code)) self.wqueue.task_done() - def limit_rate(self, response: Response) -> float | None: + def limit_rate(self, res_url: str, retry_after: str) -> float | None: next_check = None - retry_after = response.headers.get("Retry-After") if retry_after: try: # Integer: time to wait before next attempt. @@ -471,7 +490,7 @@ def limit_rate(self, response: Response) -> float | None: delay = (until - datetime.now(timezone.utc)).total_seconds() else: next_check = time.time() + delay - netloc = urlparse(response.url).netloc + netloc = urlparse(res_url).netloc if next_check is None: max_delay = self.config.linkcheck_rate_limit_timeout try: diff --git a/tests/test-server.lock b/tests/test-server.lock new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 260cf2c4214..f4229cccd68 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -637,14 +637,14 @@ class FakeResponse: def test_limit_rate_default_sleep(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {}) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 60.0 def test_limit_rate_user_max_delay(app): app.config.linkcheck_rate_limit_timeout = 0.0 worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {}) - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check is None @@ -653,7 +653,7 @@ def test_limit_rate_doubles_previous_wait_time(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 120.0 @@ -663,7 +663,7 @@ def test_limit_rate_clips_wait_time_to_max_time(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 90.0 @@ -672,7 +672,7 @@ def test_limit_rate_bails_out_after_waiting_max_time(app): rate_limits = {"localhost": RateLimit(90.0, 0.0)} worker = 
HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check is None From f9757d1a4eb4f1a39429125e15574f01281179b8 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 20 Jul 2023 20:51:00 +0100 Subject: [PATCH 2/3] rm test-server.lock --- tests/test-server.lock | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test-server.lock diff --git a/tests/test-server.lock b/tests/test-server.lock deleted file mode 100644 index e69de29bb2d..00000000000 From 7c63054a826cb969ec8dbccfd394d99ab291b74d Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 20 Jul 2023 20:56:23 +0100 Subject: [PATCH 3/3] Update --- sphinx/builders/linkcheck.py | 83 +++++++++++++++++------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 4df0d276b98..f4299d14c17 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -13,8 +13,8 @@ from os import path from queue import PriorityQueue, Queue from threading import Thread -from typing import Any, Generator, NamedTuple, Tuple, Union, cast -from urllib.parse import unquote, urlparse, urlunparse +from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast +from urllib.parse import unquote, urlparse, urlsplit, urlunparse from docutils import nodes from requests import Response @@ -72,7 +72,7 @@ class RateLimit(NamedTuple): class AnchorCheckParser(HTMLParser): - """Specialized HTML parser that looks for a specific anchor.""" + """Specialised HTML parser that looks for a specific anchor.""" def __init__(self, search_anchor: str) -> None: super().__init__() @@ -88,8 +88,8 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None: def contains_anchor(response: Response, anchor: str) -> bool: - """Determines whether an anchor is contained within an HTTP response - """ + """Determine if an anchor is contained within an HTTP response.""" + parser = AnchorCheckParser(unquote(anchor)) # Read file in chunks. If we find a matching anchor, we break # the loop early in hopes not to have to download the whole thing. @@ -270,7 +270,7 @@ def run(self) -> None: kwargs['timeout'] = self.config.linkcheck_timeout def get_request_headers() -> dict[str, str]: - url = urlparse(uri) + url = urlsplit(uri) candidates = [f"{url.scheme}://{url.netloc}", f"{url.scheme}://{url.netloc}/", uri, @@ -288,7 +288,7 @@ def check_uri() -> tuple[str, str, int]: req_url, delimiter, anchor = uri.partition('#') for rex in self.anchors_ignore if delimiter and anchor else []: if rex.match(anchor): - anchor = None + anchor = '' break # handle non-ASCII URIs @@ -307,43 +307,30 @@ def check_uri() -> tuple[str, str, int]: # update request headers for the URL kwargs['headers'] = get_request_headers() - def retrieval_methods(): - if not anchor or not self.config.linkcheck_anchors: - yield lambda: requests.head( - url=req_url, - auth=auth_info, - config=self.config, - allow_redirects=True, - **kwargs, - ) - yield lambda: requests.get( - url=req_url, - auth=auth_info, - config=self.config, - stream=True, - **kwargs, - ) - # Linkcheck HTTP request logic: # # - Attempt HTTP HEAD before HTTP GET unless page content is required. # - Follow server-issued HTTP redirects. 
- # - Respect server-issued HTTP 429 backoffs. + # - Respect server-issued HTTP 429 back-offs. error_message = None - for retrieval_attempt in retrieval_methods(): + status_code = -1 + response_url = retry_after = '' + for retrieval_method, retrieval_kwargs in _retrieval_methods( + self.config.linkcheck_anchors, anchor, + ): try: - with retrieval_attempt() as response: + with retrieval_method(url=req_url, auth=auth_info, config=self.config, + **retrieval_kwargs, **kwargs) as response: if response.ok and anchor and not contains_anchor(response, anchor): raise Exception(__(f'Anchor {anchor!r} not found')) # Copy data we need from the (closed) response - status_code, redirect_status_code, retry_after, res_url = ( - response.status_code, - response.history[-1].status_code if response.history else None, - response.headers.get('Retry-After'), - response.url, - ) + status_code = response.status_code + redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501 + retry_after = response.headers.get('Retry-After') + response_url = f'{response.url}' response.raise_for_status() + del response break except (ConnectionError, TooManyRedirects) as err: @@ -355,13 +342,13 @@ def retrieval_methods(): except HTTPError as err: error_message = str(err) - # Unauthorized: the reference probably exists + # Unauthorised: the reference probably exists if status_code == 401: return 'working', 'unauthorized', 0 # Rate limiting; back-off if allowed, or report failure otherwise if status_code == 429: - if next_check := self.limit_rate(res_url, retry_after): + if next_check := self.limit_rate(response_url, retry_after): self.wqueue.put(CheckRequest(next_check, hyperlink), False) return 'rate-limited', '', 0 return 'broken', error_message, 0 @@ -384,20 +371,19 @@ def retrieval_methods(): return 'broken', error_message, 0 # Success; clear rate limits for the origin - netloc = urlparse(req_url).netloc + netloc = urlsplit(req_url).netloc try: del self.rate_limits[netloc] except KeyError: pass - if res_url.rstrip('/') == req_url.rstrip('/'): - return 'working', '', 0 - elif allowed_redirect(req_url, res_url): + if ((response_url.rstrip('/') == req_url.rstrip('/')) + or allowed_redirect(req_url, response_url)): return 'working', '', 0 elif redirect_status_code is not None: - return 'redirected', res_url, redirect_status_code + return 'redirected', response_url, redirect_status_code else: - return 'redirected', res_url, 0 + return 'redirected', response_url, 0 def allowed_redirect(url: str, new_url: str) -> bool: return any( @@ -448,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]: if uri is None: break - netloc = urlparse(uri).netloc + netloc = urlsplit(uri).netloc try: # Refresh rate limit. 
# When there are many links in the queue, workers are all stuck waiting @@ -471,7 +457,7 @@ def check(docname: str) -> tuple[str, str, int]: self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code)) self.wqueue.task_done() - def limit_rate(self, res_url: str, retry_after: str) -> float | None: + def limit_rate(self, response_url: str, retry_after: str) -> float | None: next_check = None if retry_after: try: @@ -490,7 +476,7 @@ def limit_rate(self, res_url: str, retry_after: str) -> float | None: delay = (until - datetime.now(timezone.utc)).total_seconds() else: next_check = time.time() + delay - netloc = urlparse(res_url).netloc + netloc = urlsplit(response_url).netloc if next_check is None: max_delay = self.config.linkcheck_rate_limit_timeout try: @@ -509,6 +495,15 @@ def limit_rate(self, res_url: str, retry_after: str) -> float | None: return next_check +def _retrieval_methods( + linkcheck_anchors: bool, + anchor: str, +) -> Iterator[tuple[Callable, dict[str, bool]]]: + if not linkcheck_anchors or not anchor: + yield requests.head, {'allow_redirects': True} + yield requests.get, {'stream': True} + + class HyperlinkCollector(SphinxPostTransform): builders = ('linkcheck',) default_priority = 800
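
Note (illustrative, not part of the patch): the retrieval logic introduced above boils down to "try HTTP HEAD, fall back to HTTP GET, and classify the failure". A minimal standalone sketch of that pattern follows, using the plain `requests` package rather than Sphinx's wrapped `sphinx.util.requests` (which also threads the builder `config` through); the helper names `check_url` and `need_content` and the `timeout=30` value are invented for the example.

    # Illustrative sketch only -- plain `requests`, not Sphinx's wrapper.
    import requests
    from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects


    def _retrieval_methods(need_content):
        """Yield (method, kwargs) pairs: HEAD first unless the body is needed."""
        if not need_content:
            yield requests.head, {'allow_redirects': True}
        yield requests.get, {'stream': True}


    def check_url(url, need_content=False):
        """Return a status string for *url*, trying HEAD before GET."""
        error_message = 'all retrieval methods exhausted'
        for method, extra_kwargs in _retrieval_methods(need_content):
            try:
                with method(url, timeout=30, **extra_kwargs) as response:
                    response.raise_for_status()
                return 'working'
            except (ConnectionError, TooManyRedirects) as err:
                # Some servers drop the connection on HEAD requests; fall back to GET.
                error_message = str(err)
                continue
            except HTTPError as err:
                if err.response.status_code == 429:
                    return 'rate-limited'  # the caller should honour Retry-After
                return f'broken: {err}'
        return f'broken: {error_message}'

HEAD avoids downloading the response body, but some servers mishandle it (dropping the connection or rejecting the method), which is why the GET fallback and the ConnectionError/TooManyRedirects handling exist; anchor checking forces the GET path because the body is required.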
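
Similarly, the reworked limit_rate() now receives the Retry-After header value directly instead of a Response object. A rough sketch of turning that header into an absolute next-check timestamp is below; the function name `next_check_time` is invented, and the patch's limit_rate() additionally doubles the previous wait time and gives up once `linkcheck_rate_limit_timeout` is exceeded.

    # Illustrative sketch of Retry-After parsing (integer seconds or HTTP-date).
    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime


    def next_check_time(retry_after):
        """Convert a Retry-After header value into an epoch timestamp, or None."""
        if not retry_after:
            return None
        try:
            # Integer form: number of seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            # HTTP-date form: an absolute date after which to retry.
            try:
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                return None
            if until.tzinfo is None:
                until = until.replace(tzinfo=timezone.utc)
            delay = (until - datetime.now(timezone.utc)).total_seconds()
        return time.time() + delay

Calling next_check_time(response.headers.get('Retry-After')) mirrors how the updated tests now invoke worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")).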