 from os import path
 from queue import PriorityQueue, Queue
 from threading import Thread
-from typing import Any, Generator, NamedTuple, Tuple, Union, cast
-from urllib.parse import unquote, urlparse, urlunparse
+from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
+from urllib.parse import unquote, urlparse, urlsplit, urlunparse
 
 from docutils import nodes
 from requests import Response
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):
 
 
 class AnchorCheckParser(HTMLParser):
-    """Specialized HTML parser that looks for a specific anchor."""
+    """Specialised HTML parser that looks for a specific anchor."""
 
     def __init__(self, search_anchor: str) -> None:
         super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
                 break
 
 
-def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
-    """Reads HTML data from a response object `response` searching for `anchor`.
-    Returns True if anchor was found, False otherwise.
-    """
-    parser = AnchorCheckParser(anchor)
+def contains_anchor(response: Response, anchor: str) -> bool:
+    """Determine if an anchor is contained within an HTTP response."""
+
+    parser = AnchorCheckParser(unquote(anchor))
     # Read file in chunks. If we find a matching anchor, we break
     # the loop early in hopes not to have to download the whole thing.
     for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
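
For reference, a minimal standalone sketch (names assumed; not part of this diff) of the chunk-friendly anchor search that `contains_anchor` performs — the parser records a hit as soon as a matching `id`/`name` attribute appears:

    from html.parser import HTMLParser

    class AnchorFinder(HTMLParser):
        """Looks for one anchor; `found` flips True on the first match."""
        def __init__(self, search_anchor: str) -> None:
            super().__init__()
            self.search_anchor = search_anchor
            self.found = False

        def handle_starttag(self, tag, attrs):
            for key, value in attrs:
                if key in ('id', 'name') and value == self.search_anchor:
                    self.found = True

    finder = AnchorFinder('usage')
    finder.feed('<html><body><h2 id="usage">Usage</h2></body></html>')
    print(finder.found)  # True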
@@ -271,7 +270,7 @@ def run(self) -> None:
             kwargs['timeout'] = self.config.linkcheck_timeout
 
         def get_request_headers() -> dict[str, str]:
-            url = urlparse(uri)
+            url = urlsplit(uri)
             candidates = [f"{url.scheme}://{url.netloc}",
                           f"{url.scheme}://{url.netloc}/",
                           uri,
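
The urlparse-to-urlsplit switch is safe here because only the `scheme` and `netloc` fields are used, and those are identical in both; `urlsplit` simply skips the rarely needed `;parameters` handling. An illustrative comparison (example URL, not from the diff):

    from urllib.parse import urlparse, urlsplit

    uri = 'https://example.org/docs/page.html#usage'
    print(urlsplit(uri).scheme, urlsplit(uri).netloc)  # https example.org
    print(urlparse(uri).scheme, urlparse(uri).netloc)  # https example.org (same fields)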
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
             return {}
 
         def check_uri() -> tuple[str, str, int]:
-            # split off anchor
-            if '#' in uri:
-                req_url, anchor = uri.split('#', 1)
-                for rex in self.anchors_ignore:
-                    if rex.match(anchor):
-                        anchor = None
-                        break
-            else:
-                req_url = uri
-                anchor = None
+            req_url, delimiter, anchor = uri.partition('#')
+            for rex in self.anchors_ignore if delimiter and anchor else []:
+                if rex.match(anchor):
+                    anchor = ''
+                    break
 
             # handle non-ASCII URIs
             try:
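
The `str.partition` rewrite collapses the old two-branch split: it always returns a 3-tuple, with empty strings when no `'#'` is present, so no `else` arm is needed. A quick illustration (example values):

    print('https://example.org/page#intro'.partition('#'))
    # ('https://example.org/page', '#', 'intro')
    print('https://example.org/page'.partition('#'))
    # ('https://example.org/page', '', '')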
@@ -313,71 +307,83 @@ def check_uri() -> tuple[str, str, int]:
             # update request headers for the URL
             kwargs['headers'] = get_request_headers()
 
-            try:
-                if anchor and self.config.linkcheck_anchors:
-                    # Read the whole document and see if #anchor exists
-                    with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
-                                      **kwargs) as response:
-                        response.raise_for_status()
-                        found = check_anchor(response, unquote(anchor))
-
-                        if not found:
-                            raise Exception(__("Anchor '%s' not found") % anchor)
-                else:
-                    try:
-                        # try a HEAD request first, which should be easier on
-                        # the server and the network
-                        with requests.head(req_url, allow_redirects=True, config=self.config,
-                                           auth=auth_info, **kwargs) as response:
-                            response.raise_for_status()
+            # Linkcheck HTTP request logic:
+            #
+            # - Attempt HTTP HEAD before HTTP GET unless page content is required.
+            # - Follow server-issued HTTP redirects.
+            # - Respect server-issued HTTP 429 back-offs.
+            error_message = None
+            status_code = -1
+            response_url = retry_after = ''
+            for retrieval_method, retrieval_kwargs in _retrieval_methods(
+                self.config.linkcheck_anchors, anchor,
+            ):
+                try:
+                    with retrieval_method(url=req_url, auth=auth_info, config=self.config,
+                                          **retrieval_kwargs, **kwargs) as response:
+                        if response.ok and anchor and not contains_anchor(response, anchor):
+                            raise Exception(__(f'Anchor {anchor!r} not found'))
+
+                        # Copy data we need from the (closed) response
+                        status_code = response.status_code
+                        redirect_status_code = response.history[-1].status_code if response.history else None  # NoQA: E501
+                        retry_after = response.headers.get('Retry-After')
+                        response_url = f'{response.url}'
+                    response.raise_for_status()
+                    del response
+                    break
+
+                except (ConnectionError, TooManyRedirects) as err:
                     # Servers drop the connection on HEAD requests, causing
                     # ConnectionError.
-                    except (ConnectionError, HTTPError, TooManyRedirects) as err:
-                        if isinstance(err, HTTPError) and err.response.status_code == 429:
-                            raise
-                        # retry with GET request if that fails, some servers
-                        # don't like HEAD requests.
-                        with requests.get(req_url, stream=True, config=self.config,
-                                          auth=auth_info, **kwargs) as response:
-                            response.raise_for_status()
-            except HTTPError as err:
-                if err.response.status_code == 401:
-                    # We'll take "Unauthorized" as working.
-                    return 'working', ' - unauthorized', 0
-                elif err.response.status_code == 429:
-                    next_check = self.limit_rate(err.response)
-                    if next_check is not None:
-                        self.wqueue.put(CheckRequest(next_check, hyperlink), False)
-                        return 'rate-limited', '', 0
-                    return 'broken', str(err), 0
-                elif err.response.status_code == 503:
-                    # We'll take "Service Unavailable" as ignored.
-                    return 'ignored', str(err), 0
-                else:
+                    error_message = str(err)
+                    continue
+
+                except HTTPError as err:
+                    error_message = str(err)
+
+                    # Unauthorised: the reference probably exists
+                    if status_code == 401:
+                        return 'working', 'unauthorized', 0
+
+                    # Rate limiting; back-off if allowed, or report failure otherwise
+                    if status_code == 429:
+                        if next_check := self.limit_rate(response_url, retry_after):
+                            self.wqueue.put(CheckRequest(next_check, hyperlink), False)
+                            return 'rate-limited', '', 0
+                        return 'broken', error_message, 0
+
+                    # Don't claim success/failure during server-side outages
+                    if status_code == 503:
+                        return 'ignored', 'service unavailable', 0
+
+                    # For most HTTP failures, continue attempting alternate retrieval methods
+                    continue
+
+                except Exception as err:
+                    # Unhandled exception (intermittent or permanent); report that
+                    # the link is broken.
                     return 'broken', str(err), 0
-            except Exception as err:
-                return 'broken', str(err), 0
+
             else:
-                netloc = urlparse(req_url).netloc
-                try:
-                    del self.rate_limits[netloc]
-                except KeyError:
-                    pass
-                if response.url.rstrip('/') == req_url.rstrip('/'):
+                # All available retrieval methods have been exhausted; report
+                # that the link is broken.
+                return 'broken', error_message, 0
+
+            # Success; clear rate limits for the origin
+            netloc = urlsplit(req_url).netloc
+            try:
+                del self.rate_limits[netloc]
+            except KeyError:
+                pass
+
+            if ((response_url.rstrip('/') == req_url.rstrip('/'))
+                    or allowed_redirect(req_url, response_url)):
                 return 'working', '', 0
+            elif redirect_status_code is not None:
+                return 'redirected', response_url, redirect_status_code
             else:
-                new_url = response.url
-                if anchor:
-                    new_url += '#' + anchor
-
-                if allowed_redirect(req_url, new_url):
-                    return 'working', '', 0
-                elif response.history:
-                    # history contains any redirects, get last
-                    code = response.history[-1].status_code
-                    return 'redirected', new_url, code
-                else:
-                    return 'redirected', new_url, 0
+                return 'redirected', response_url, 0
 
         def allowed_redirect(url: str, new_url: str) -> bool:
             return any(
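
The rewritten control flow leans on Python's for/else: the `else` branch runs only when the loop finishes without a `break`, i.e. when every retrieval method has failed. A minimal sketch of that behaviour (illustrative only):

    for method in ('HEAD', 'GET'):
        if method == 'GET':  # pretend this attempt succeeded
            break            # skips the loop's else branch
    else:
        print('all retrieval methods exhausted')  # not reached here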
@@ -428,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]:
 
             if uri is None:
                 break
-            netloc = urlparse(uri).netloc
+            netloc = urlsplit(uri).netloc
             try:
                 # Refresh rate limit.
                 # When there are many links in the queue, workers are all stuck waiting
@@ -451,9 +457,8 @@ def check(docname: str) -> tuple[str, str, int]:
             self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
             self.wqueue.task_done()
 
-    def limit_rate(self, response: Response) -> float | None:
+    def limit_rate(self, response_url: str, retry_after: str) -> float | None:
         next_check = None
-        retry_after = response.headers.get("Retry-After")
         if retry_after:
             try:
                 # Integer: time to wait before next attempt.
@@ -471,7 +476,7 @@ def limit_rate(self, response: Response) -> float | None:
                     delay = (until - datetime.now(timezone.utc)).total_seconds()
             else:
                 next_check = time.time() + delay
-        netloc = urlparse(response.url).netloc
+        netloc = urlsplit(response_url).netloc
         if next_check is None:
             max_delay = self.config.linkcheck_rate_limit_timeout
             try:
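
`limit_rate` still handles both Retry-After forms that HTTP allows: a delay in seconds, or an absolute HTTP-date for the next attempt. A sketch of the two parses (example header values, not from the diff):

    from email.utils import parsedate_to_datetime

    print(float('120'))  # integer form: back off for 120.0 seconds
    print(parsedate_to_datetime('Wed, 21 Oct 2015 07:28:00 GMT'))
    # HTTP-date form: 2015-10-21 07:28:00+00:00, an absolute retry time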
@@ -490,6 +495,15 @@ def limit_rate(self, response: Response) -> float | None:
         return next_check
 
 
+def _retrieval_methods(
+    linkcheck_anchors: bool,
+    anchor: str,
+) -> Iterator[tuple[Callable, dict[str, bool]]]:
+    if not linkcheck_anchors or not anchor:
+        yield requests.head, {'allow_redirects': True}
+    yield requests.get, {'stream': True}
+
+
 class HyperlinkCollector(SphinxPostTransform):
     builders = ('linkcheck',)
     default_priority = 800
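
Given the generator above, the two call patterns would yield as follows (illustrative; `requests` here is the `sphinx.util.requests` wrapper module used throughout the diff):

    # Anchors not required: try HEAD first, fall back to GET.
    list(_retrieval_methods(False, ''))
    # -> [(requests.head, {'allow_redirects': True}), (requests.get, {'stream': True})]

    # Anchor must be verified: GET only, since the response body is needed.
    list(_retrieval_methods(True, 'usage'))
    # -> [(requests.get, {'stream': True})]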