From b861e9e37465d0320bae54db9af3dbe5c7dc6a8f Mon Sep 17 00:00:00 2001 From: James Addison Date: Sat, 20 May 2023 18:32:32 +0100 Subject: [PATCH 1/3] Refactor: linkcheck builder: reduce the lifetime of the 'response' variable --- sphinx/builders/linkcheck.py | 171 +++++++++++++++++++--------------- tests/test-server.lock | 0 tests/test_build_linkcheck.py | 10 +- 3 files changed, 100 insertions(+), 81 deletions(-) create mode 100644 tests/test-server.lock diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 6c99f96e685..4df0d276b98 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None: break -def check_anchor(response: requests.requests.Response, anchor: str) -> bool: - """Reads HTML data from a response object `response` searching for `anchor`. - Returns True if anchor was found, False otherwise. +def contains_anchor(response: Response, anchor: str) -> bool: + """Determines whether an anchor is contained within an HTTP response """ - parser = AnchorCheckParser(anchor) + parser = AnchorCheckParser(unquote(anchor)) # Read file in chunks. If we find a matching anchor, we break # the loop early in hopes not to have to download the whole thing. for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): @@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]: return {} def check_uri() -> tuple[str, str, int]: - # split off anchor - if '#' in uri: - req_url, anchor = uri.split('#', 1) - for rex in self.anchors_ignore: - if rex.match(anchor): - anchor = None - break - else: - req_url = uri - anchor = None + req_url, delimiter, anchor = uri.partition('#') + for rex in self.anchors_ignore if delimiter and anchor else []: + if rex.match(anchor): + anchor = None + break # handle non-ASCII URIs try: @@ -313,71 +307,97 @@ def check_uri() -> tuple[str, str, int]: # update request headers for the URL kwargs['headers'] = get_request_headers() - try: - if anchor and self.config.linkcheck_anchors: - # Read the whole document and see if #anchor exists - with requests.get(req_url, stream=True, config=self.config, auth=auth_info, - **kwargs) as response: - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) - - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) - else: - try: - # try a HEAD request first, which should be easier on - # the server and the network - with requests.head(req_url, allow_redirects=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() + def retrieval_methods(): + if not anchor or not self.config.linkcheck_anchors: + yield lambda: requests.head( + url=req_url, + auth=auth_info, + config=self.config, + allow_redirects=True, + **kwargs, + ) + yield lambda: requests.get( + url=req_url, + auth=auth_info, + config=self.config, + stream=True, + **kwargs, + ) + + # Linkcheck HTTP request logic: + # + # - Attempt HTTP HEAD before HTTP GET unless page content is required. + # - Follow server-issued HTTP redirects. + # - Respect server-issued HTTP 429 backoffs. 
+ error_message = None + for retrieval_attempt in retrieval_methods(): + try: + with retrieval_attempt() as response: + if response.ok and anchor and not contains_anchor(response, anchor): + raise Exception(__(f'Anchor {anchor!r} not found')) + + # Copy data we need from the (closed) response + status_code, redirect_status_code, retry_after, res_url = ( + response.status_code, + response.history[-1].status_code if response.history else None, + response.headers.get('Retry-After'), + response.url, + ) + response.raise_for_status() + break + + except (ConnectionError, TooManyRedirects) as err: # Servers drop the connection on HEAD requests, causing # ConnectionError. - except (ConnectionError, HTTPError, TooManyRedirects) as err: - if isinstance(err, HTTPError) and err.response.status_code == 429: - raise - # retry with GET request if that fails, some servers - # don't like HEAD requests. - with requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() - except HTTPError as err: - if err.response.status_code == 401: - # We'll take "Unauthorized" as working. - return 'working', ' - unauthorized', 0 - elif err.response.status_code == 429: - next_check = self.limit_rate(err.response) - if next_check is not None: - self.wqueue.put(CheckRequest(next_check, hyperlink), False) - return 'rate-limited', '', 0 - return 'broken', str(err), 0 - elif err.response.status_code == 503: - # We'll take "Service Unavailable" as ignored. - return 'ignored', str(err), 0 - else: + error_message = str(err) + continue + + except HTTPError as err: + error_message = str(err) + + # Unauthorized: the reference probably exists + if status_code == 401: + return 'working', 'unauthorized', 0 + + # Rate limiting; back-off if allowed, or report failure otherwise + if status_code == 429: + if next_check := self.limit_rate(res_url, retry_after): + self.wqueue.put(CheckRequest(next_check, hyperlink), False) + return 'rate-limited', '', 0 + return 'broken', error_message, 0 + + # Don't claim success/failure during server-side outages + if status_code == 503: + return 'ignored', 'service unavailable', 0 + + # For most HTTP failures, continue attempting alternate retrieval methods + continue + + except Exception as err: + # Unhandled exception (intermittent or permanent); report that the + # the link is broken. return 'broken', str(err), 0 - except Exception as err: - return 'broken', str(err), 0 + else: - netloc = urlparse(req_url).netloc - try: - del self.rate_limits[netloc] - except KeyError: - pass - if response.url.rstrip('/') == req_url.rstrip('/'): + # All available retrieval methods have been exhausted; report + # that the link is broken. 
+ return 'broken', error_message, 0 + + # Success; clear rate limits for the origin + netloc = urlparse(req_url).netloc + try: + del self.rate_limits[netloc] + except KeyError: + pass + + if res_url.rstrip('/') == req_url.rstrip('/'): + return 'working', '', 0 + elif allowed_redirect(req_url, res_url): return 'working', '', 0 + elif redirect_status_code is not None: + return 'redirected', res_url, redirect_status_code else: - new_url = response.url - if anchor: - new_url += '#' + anchor - - if allowed_redirect(req_url, new_url): - return 'working', '', 0 - elif response.history: - # history contains any redirects, get last - code = response.history[-1].status_code - return 'redirected', new_url, code - else: - return 'redirected', new_url, 0 + return 'redirected', res_url, 0 def allowed_redirect(url: str, new_url: str) -> bool: return any( @@ -451,9 +471,8 @@ def check(docname: str) -> tuple[str, str, int]: self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code)) self.wqueue.task_done() - def limit_rate(self, response: Response) -> float | None: + def limit_rate(self, res_url: str, retry_after: str) -> float | None: next_check = None - retry_after = response.headers.get("Retry-After") if retry_after: try: # Integer: time to wait before next attempt. @@ -471,7 +490,7 @@ def limit_rate(self, response: Response) -> float | None: delay = (until - datetime.now(timezone.utc)).total_seconds() else: next_check = time.time() + delay - netloc = urlparse(response.url).netloc + netloc = urlparse(res_url).netloc if next_check is None: max_delay = self.config.linkcheck_rate_limit_timeout try: diff --git a/tests/test-server.lock b/tests/test-server.lock new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 260cf2c4214..f4229cccd68 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -637,14 +637,14 @@ class FakeResponse: def test_limit_rate_default_sleep(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {}) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 60.0 def test_limit_rate_user_max_delay(app): app.config.linkcheck_rate_limit_timeout = 0.0 worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {}) - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check is None @@ -653,7 +653,7 @@ def test_limit_rate_doubles_previous_wait_time(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 120.0 @@ -663,7 +663,7 @@ def test_limit_rate_clips_wait_time_to_max_time(app): worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) with mock.patch('time.time', return_value=0.0): - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check == 90.0 @@ -672,7 +672,7 @@ def test_limit_rate_bails_out_after_waiting_max_time(app): rate_limits = {"localhost": RateLimit(90.0, 0.0)} worker = 
HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), rate_limits) - next_check = worker.limit_rate(FakeResponse()) + next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")) assert next_check is None From f9757d1a4eb4f1a39429125e15574f01281179b8 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 20 Jul 2023 20:51:00 +0100 Subject: [PATCH 2/3] rm test-server.lock --- tests/test-server.lock | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test-server.lock diff --git a/tests/test-server.lock b/tests/test-server.lock deleted file mode 100644 index e69de29bb2d..00000000000 From 7c63054a826cb969ec8dbccfd394d99ab291b74d Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 20 Jul 2023 20:56:23 +0100 Subject: [PATCH 3/3] Update --- sphinx/builders/linkcheck.py | 83 +++++++++++++++++------------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 4df0d276b98..f4299d14c17 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -13,8 +13,8 @@ from os import path from queue import PriorityQueue, Queue from threading import Thread -from typing import Any, Generator, NamedTuple, Tuple, Union, cast -from urllib.parse import unquote, urlparse, urlunparse +from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast +from urllib.parse import unquote, urlparse, urlsplit, urlunparse from docutils import nodes from requests import Response @@ -72,7 +72,7 @@ class RateLimit(NamedTuple): class AnchorCheckParser(HTMLParser): - """Specialized HTML parser that looks for a specific anchor.""" + """Specialised HTML parser that looks for a specific anchor.""" def __init__(self, search_anchor: str) -> None: super().__init__() @@ -88,8 +88,8 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None: def contains_anchor(response: Response, anchor: str) -> bool: - """Determines whether an anchor is contained within an HTTP response - """ + """Determine if an anchor is contained within an HTTP response.""" + parser = AnchorCheckParser(unquote(anchor)) # Read file in chunks. If we find a matching anchor, we break # the loop early in hopes not to have to download the whole thing. @@ -270,7 +270,7 @@ def run(self) -> None: kwargs['timeout'] = self.config.linkcheck_timeout def get_request_headers() -> dict[str, str]: - url = urlparse(uri) + url = urlsplit(uri) candidates = [f"{url.scheme}://{url.netloc}", f"{url.scheme}://{url.netloc}/", uri, @@ -288,7 +288,7 @@ def check_uri() -> tuple[str, str, int]: req_url, delimiter, anchor = uri.partition('#') for rex in self.anchors_ignore if delimiter and anchor else []: if rex.match(anchor): - anchor = None + anchor = '' break # handle non-ASCII URIs @@ -307,43 +307,30 @@ def check_uri() -> tuple[str, str, int]: # update request headers for the URL kwargs['headers'] = get_request_headers() - def retrieval_methods(): - if not anchor or not self.config.linkcheck_anchors: - yield lambda: requests.head( - url=req_url, - auth=auth_info, - config=self.config, - allow_redirects=True, - **kwargs, - ) - yield lambda: requests.get( - url=req_url, - auth=auth_info, - config=self.config, - stream=True, - **kwargs, - ) - # Linkcheck HTTP request logic: # # - Attempt HTTP HEAD before HTTP GET unless page content is required. # - Follow server-issued HTTP redirects. 
- # - Respect server-issued HTTP 429 backoffs. + # - Respect server-issued HTTP 429 back-offs. error_message = None - for retrieval_attempt in retrieval_methods(): + status_code = -1 + response_url = retry_after = '' + for retrieval_method, retrieval_kwargs in _retrieval_methods( + self.config.linkcheck_anchors, anchor, + ): try: - with retrieval_attempt() as response: + with retrieval_method(url=req_url, auth=auth_info, config=self.config, + **retrieval_kwargs, **kwargs) as response: if response.ok and anchor and not contains_anchor(response, anchor): raise Exception(__(f'Anchor {anchor!r} not found')) # Copy data we need from the (closed) response - status_code, redirect_status_code, retry_after, res_url = ( - response.status_code, - response.history[-1].status_code if response.history else None, - response.headers.get('Retry-After'), - response.url, - ) + status_code = response.status_code + redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501 + retry_after = response.headers.get('Retry-After') + response_url = f'{response.url}' response.raise_for_status() + del response break except (ConnectionError, TooManyRedirects) as err: @@ -355,13 +342,13 @@ def retrieval_methods(): except HTTPError as err: error_message = str(err) - # Unauthorized: the reference probably exists + # Unauthorised: the reference probably exists if status_code == 401: return 'working', 'unauthorized', 0 # Rate limiting; back-off if allowed, or report failure otherwise if status_code == 429: - if next_check := self.limit_rate(res_url, retry_after): + if next_check := self.limit_rate(response_url, retry_after): self.wqueue.put(CheckRequest(next_check, hyperlink), False) return 'rate-limited', '', 0 return 'broken', error_message, 0 @@ -384,20 +371,19 @@ def retrieval_methods(): return 'broken', error_message, 0 # Success; clear rate limits for the origin - netloc = urlparse(req_url).netloc + netloc = urlsplit(req_url).netloc try: del self.rate_limits[netloc] except KeyError: pass - if res_url.rstrip('/') == req_url.rstrip('/'): - return 'working', '', 0 - elif allowed_redirect(req_url, res_url): + if ((response_url.rstrip('/') == req_url.rstrip('/')) + or allowed_redirect(req_url, response_url)): return 'working', '', 0 elif redirect_status_code is not None: - return 'redirected', res_url, redirect_status_code + return 'redirected', response_url, redirect_status_code else: - return 'redirected', res_url, 0 + return 'redirected', response_url, 0 def allowed_redirect(url: str, new_url: str) -> bool: return any( @@ -448,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]: if uri is None: break - netloc = urlparse(uri).netloc + netloc = urlsplit(uri).netloc try: # Refresh rate limit. 
# When there are many links in the queue, workers are all stuck waiting @@ -471,7 +457,7 @@ def check(docname: str) -> tuple[str, str, int]: self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code)) self.wqueue.task_done() - def limit_rate(self, res_url: str, retry_after: str) -> float | None: + def limit_rate(self, response_url: str, retry_after: str) -> float | None: next_check = None if retry_after: try: @@ -490,7 +476,7 @@ def limit_rate(self, res_url: str, retry_after: str) -> float | None: delay = (until - datetime.now(timezone.utc)).total_seconds() else: next_check = time.time() + delay - netloc = urlparse(res_url).netloc + netloc = urlsplit(response_url).netloc if next_check is None: max_delay = self.config.linkcheck_rate_limit_timeout try: @@ -509,6 +495,15 @@ def limit_rate(self, res_url: str, retry_after: str) -> float | None: return next_check +def _retrieval_methods( + linkcheck_anchors: bool, + anchor: str, +) -> Iterator[tuple[Callable, dict[str, bool]]]: + if not linkcheck_anchors or not anchor: + yield requests.head, {'allow_redirects': True} + yield requests.get, {'stream': True} + + class HyperlinkCollector(SphinxPostTransform): builders = ('linkcheck',) default_priority = 800
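
Note (illustrative, not part of the patch): the retrieval logic introduced above boils down to "try HTTP HEAD, fall back to HTTP GET, and classify the failure". A minimal standalone sketch of that pattern follows, using the plain `requests` package rather than Sphinx's wrapped `sphinx.util.requests` (which also threads the builder `config` through); the helper names `check_url` and `need_content` and the `timeout=30` value are invented for the example.

    # Illustrative sketch only -- plain `requests`, not Sphinx's wrapper.
    import requests
    from requests.exceptions import ConnectionError, HTTPError, TooManyRedirects


    def _retrieval_methods(need_content):
        """Yield (method, kwargs) pairs: HEAD first unless the body is needed."""
        if not need_content:
            yield requests.head, {'allow_redirects': True}
        yield requests.get, {'stream': True}


    def check_url(url, need_content=False):
        """Return a status string for *url*, trying HEAD before GET."""
        error_message = 'all retrieval methods exhausted'
        for method, extra_kwargs in _retrieval_methods(need_content):
            try:
                with method(url, timeout=30, **extra_kwargs) as response:
                    response.raise_for_status()
                return 'working'
            except (ConnectionError, TooManyRedirects) as err:
                # Some servers drop the connection on HEAD requests; fall back to GET.
                error_message = str(err)
                continue
            except HTTPError as err:
                if err.response.status_code == 429:
                    return 'rate-limited'  # the caller should honour Retry-After
                return f'broken: {err}'
        return f'broken: {error_message}'

HEAD avoids downloading the response body, but some servers mishandle it (dropping the connection or rejecting the method), which is why the GET fallback and the ConnectionError/TooManyRedirects handling exist; anchor checking forces the GET path because the body is required.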
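
Similarly, the reworked limit_rate() now receives the Retry-After header value directly instead of a Response object. A rough sketch of turning that header into an absolute next-check timestamp is below; the function name `next_check_time` is invented, and the patch's limit_rate() additionally doubles the previous wait time and gives up once `linkcheck_rate_limit_timeout` is exceeded.

    # Illustrative sketch of Retry-After parsing (integer seconds or HTTP-date).
    import time
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime


    def next_check_time(retry_after):
        """Convert a Retry-After header value into an epoch timestamp, or None."""
        if not retry_after:
            return None
        try:
            # Integer form: number of seconds to wait before the next attempt.
            delay = float(retry_after)
        except ValueError:
            # HTTP-date form: an absolute date after which to retry.
            try:
                until = parsedate_to_datetime(retry_after)
            except (TypeError, ValueError):
                return None
            if until.tzinfo is None:
                until = until.replace(tzinfo=timezone.utc)
            delay = (until - datetime.now(timezone.utc)).total_seconds()
        return time.time() + delay

Calling next_check_time(response.headers.get('Retry-After')) mirrors how the updated tests now invoke worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After")).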