 from os import path
 from queue import PriorityQueue, Queue
 from threading import Thread
-from typing import Any, Generator, NamedTuple, Tuple, Union, cast
-from urllib.parse import unquote, urlparse, urlunparse
+from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
+from urllib.parse import unquote, urlparse, urlsplit, urlunparse
 
 from docutils import nodes
 from requests import Response
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):
 
 
 class AnchorCheckParser(HTMLParser):
-    """Specialized HTML parser that looks for a specific anchor."""
+    """Specialised HTML parser that looks for a specific anchor."""
 
     def __init__(self, search_anchor: str) -> None:
         super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
                 break
 
 
-def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
-    """Reads HTML data from a response object `response` searching for `anchor`.
-    Returns True if anchor was found, False otherwise.
-    """
-    parser = AnchorCheckParser(anchor)
+def contains_anchor(response: Response, anchor: str) -> bool:
+    """Determine if an anchor is contained within an HTTP response."""
+
+    parser = AnchorCheckParser(unquote(anchor))
     # Read file in chunks. If we find a matching anchor, we break
     # the loop early in hopes not to have to download the whole thing.
     for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
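
For reference, a minimal standalone sketch (names assumed; not part of this diff) of the chunk-friendly anchor search that `contains_anchor` performs — the parser records a hit as soon as a matching `id`/`name` attribute appears:

    from html.parser import HTMLParser

    class AnchorFinder(HTMLParser):
        """Looks for one anchor; `found` flips True on the first match."""
        def __init__(self, search_anchor: str) -> None:
            super().__init__()
            self.search_anchor = search_anchor
            self.found = False

        def handle_starttag(self, tag, attrs):
            for key, value in attrs:
                if key in ('id', 'name') and value == self.search_anchor:
                    self.found = True

    finder = AnchorFinder('usage')
    finder.feed('<html><body><h2 id="usage">Usage</h2></body></html>')
    print(finder.found)  # True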
@@ -271,7 +270,7 @@ def run(self) -> None:
             kwargs['timeout'] = self.config.linkcheck_timeout
 
         def get_request_headers() -> dict[str, str]:
-            url = urlparse(uri)
+            url = urlsplit(uri)
             candidates = [f"{url.scheme}://{url.netloc}",
                           f"{url.scheme}://{url.netloc}/",
                           uri,
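
The urlparse-to-urlsplit switch is safe here because only the `scheme` and `netloc` fields are used, and those are identical in both; `urlsplit` simply skips the rarely needed `;parameters` handling. An illustrative comparison (example URL, not from the diff):

    from urllib.parse import urlparse, urlsplit

    uri = 'https://example.org/docs/page.html#usage'
    print(urlsplit(uri).scheme, urlsplit(uri).netloc)  # https example.org
    print(urlparse(uri).scheme, urlparse(uri).netloc)  # https example.org (same fields)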
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
             return {}
 
         def check_uri() -> tuple[str, str, int]:
-            # split off anchor
-            if '#' in uri:
-                req_url, anchor = uri.split('#', 1)
-                for rex in self.anchors_ignore:
-                    if rex.match(anchor):
-                        anchor = None
-                        break
-            else:
-                req_url = uri
-                anchor = None
+            req_url, delimiter, anchor = uri.partition('#')
+            for rex in self.anchors_ignore if delimiter and anchor else []:
+                if rex.match(anchor):
+                    anchor = ''
+                    break
 
             # handle non-ASCII URIs
             try:
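
The `str.partition` rewrite collapses the old two-branch split: it always returns a 3-tuple, with empty strings when no `'#'` is present, so no `else` arm is needed. A quick illustration (example values):

    print('https://example.org/page#intro'.partition('#'))
    # ('https://example.org/page', '#', 'intro')
    print('https://example.org/page'.partition('#'))
    # ('https://example.org/page', '', '')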
@@ -313,71 +307,83 @@ def check_uri() -> tuple[str, str, int]:
             # update request headers for the URL
             kwargs['headers'] = get_request_headers()
 
-            try:
-                if anchor and self.config.linkcheck_anchors:
-                    # Read the whole document and see if #anchor exists
-                    with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
-                                      **kwargs) as response:
-                        response.raise_for_status()
-                        found = check_anchor(response, unquote(anchor))
-
-                        if not found:
-                            raise Exception(__("Anchor '%s' not found") % anchor)
-                else:
-                    try:
-                        # try a HEAD request first, which should be easier on
-                        # the server and the network
-                        with requests.head(req_url, allow_redirects=True, config=self.config,
-                                           auth=auth_info, **kwargs) as response:
-                            response.raise_for_status()
+            # Linkcheck HTTP request logic:
+            #
+            # - Attempt HTTP HEAD before HTTP GET unless page content is required.
+            # - Follow server-issued HTTP redirects.
+            # - Respect server-issued HTTP 429 back-offs.
+            error_message = None
+            status_code = -1
+            response_url = retry_after = ''
+            for retrieval_method, retrieval_kwargs in _retrieval_methods(
+                self.config.linkcheck_anchors, anchor,
+            ):
+                try:
+                    with retrieval_method(url=req_url, auth=auth_info, config=self.config,
+                                          **retrieval_kwargs, **kwargs) as response:
+                        if response.ok and anchor and not contains_anchor(response, anchor):
+                            raise Exception(__(f'Anchor {anchor!r} not found'))
+
+                        # Copy data we need from the (closed) response
+                        status_code = response.status_code
+                        redirect_status_code = response.history[-1].status_code if response.history else None  # NoQA: E501
+                        retry_after = response.headers.get('Retry-After')
+                        response_url = f'{response.url}'
+                    response.raise_for_status()
+                    del response
+                    break
+
+                except (ConnectionError, TooManyRedirects) as err:
                     # Servers drop the connection on HEAD requests, causing
                     # ConnectionError.
-                    except (ConnectionError, HTTPError, TooManyRedirects) as err:
-                        if isinstance(err, HTTPError) and err.response.status_code == 429:
-                            raise
-                        # retry with GET request if that fails, some servers
-                        # don't like HEAD requests.
-                        with requests.get(req_url, stream=True, config=self.config,
-                                          auth=auth_info, **kwargs) as response:
-                            response.raise_for_status()
-            except HTTPError as err:
-                if err.response.status_code == 401:
-                    # We'll take "Unauthorized" as working.
-                    return 'working', ' - unauthorized', 0
-                elif err.response.status_code == 429:
-                    next_check = self.limit_rate(err.response)
-                    if next_check is not None:
-                        self.wqueue.put(CheckRequest(next_check, hyperlink), False)
-                        return 'rate-limited', '', 0
-                    return 'broken', str(err), 0
-                elif err.response.status_code == 503:
-                    # We'll take "Service Unavailable" as ignored.
-                    return 'ignored', str(err), 0
-                else:
+                    error_message = str(err)
+                    continue
+
+                except HTTPError as err:
+                    error_message = str(err)
+
+                    # Unauthorised: the reference probably exists
+                    if status_code == 401:
+                        return 'working', 'unauthorized', 0
+
+                    # Rate limiting; back-off if allowed, or report failure otherwise
+                    if status_code == 429:
+                        if next_check := self.limit_rate(response_url, retry_after):
+                            self.wqueue.put(CheckRequest(next_check, hyperlink), False)
+                            return 'rate-limited', '', 0
+                        return 'broken', error_message, 0
+
+                    # Don't claim success/failure during server-side outages
+                    if status_code == 503:
+                        return 'ignored', 'service unavailable', 0
+
+                    # For most HTTP failures, continue attempting alternate retrieval methods
+                    continue
+
+                except Exception as err:
+                    # Unhandled exception (intermittent or permanent); report that
+                    # the link is broken.
                     return 'broken', str(err), 0
-            except Exception as err:
-                return 'broken', str(err), 0
+
             else:
-                netloc = urlparse(req_url).netloc
-                try:
-                    del self.rate_limits[netloc]
-                except KeyError:
-                    pass
-                if response.url.rstrip('/') == req_url.rstrip('/'):
+                # All available retrieval methods have been exhausted; report
+                # that the link is broken.
+                return 'broken', error_message, 0
+
+            # Success; clear rate limits for the origin
+            netloc = urlsplit(req_url).netloc
+            try:
+                del self.rate_limits[netloc]
+            except KeyError:
+                pass
+
+            if ((response_url.rstrip('/') == req_url.rstrip('/'))
+                    or allowed_redirect(req_url, response_url)):
                 return 'working', '', 0
+            elif redirect_status_code is not None:
+                return 'redirected', response_url, redirect_status_code
             else:
-                new_url = response.url
-                if anchor:
-                    new_url += '#' + anchor
-
-                if allowed_redirect(req_url, new_url):
-                    return 'working', '', 0
-                elif response.history:
-                    # history contains any redirects, get last
-                    code = response.history[-1].status_code
-                    return 'redirected', new_url, code
-                else:
-                    return 'redirected', new_url, 0
+                return 'redirected', response_url, 0
 
         def allowed_redirect(url: str, new_url: str) -> bool:
             return any(
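
The rewritten control flow leans on Python's for/else: the `else` branch runs only when the loop finishes without a `break`, i.e. when every retrieval method has failed. A minimal sketch of that behaviour (illustrative only):

    for method in ('HEAD', 'GET'):
        if method == 'GET':  # pretend this attempt succeeded
            break            # skips the loop's else branch
    else:
        print('all retrieval methods exhausted')  # not reached here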
@@ -428,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]:
 
             if uri is None:
                 break
-            netloc = urlparse(uri).netloc
+            netloc = urlsplit(uri).netloc
             try:
                 # Refresh rate limit.
                 # When there are many links in the queue, workers are all stuck waiting
@@ -451,9 +457,8 @@ def check(docname: str) -> tuple[str, str, int]:
             self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
             self.wqueue.task_done()
 
-    def limit_rate(self, response: Response) -> float | None:
+    def limit_rate(self, response_url: str, retry_after: str) -> float | None:
         next_check = None
-        retry_after = response.headers.get("Retry-After")
         if retry_after:
             try:
                 # Integer: time to wait before next attempt.
@@ -471,7 +476,7 @@ def limit_rate(self, response: Response) -> float | None:
                     delay = (until - datetime.now(timezone.utc)).total_seconds()
             else:
                 next_check = time.time() + delay
-        netloc = urlparse(response.url).netloc
+        netloc = urlsplit(response_url).netloc
         if next_check is None:
             max_delay = self.config.linkcheck_rate_limit_timeout
             try:
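
`limit_rate` still handles both Retry-After forms that HTTP allows: a delay in seconds, or an absolute HTTP-date for the next attempt. A sketch of the two parses (example header values, not from the diff):

    from email.utils import parsedate_to_datetime

    print(float('120'))  # integer form: back off for 120.0 seconds
    print(parsedate_to_datetime('Wed, 21 Oct 2015 07:28:00 GMT'))
    # HTTP-date form: 2015-10-21 07:28:00+00:00, an absolute retry time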
@@ -490,6 +495,15 @@ def limit_rate(self, response: Response) -> float | None:
         return next_check
 
 
+def _retrieval_methods(
+    linkcheck_anchors: bool,
+    anchor: str,
+) -> Iterator[tuple[Callable, dict[str, bool]]]:
+    if not linkcheck_anchors or not anchor:
+        yield requests.head, {'allow_redirects': True}
+    yield requests.get, {'stream': True}
+
+
 class HyperlinkCollector(SphinxPostTransform):
     builders = ('linkcheck',)
     default_priority = 800
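
Given the generator above, the two call patterns would yield as follows (illustrative; `requests` here is the `sphinx.util.requests` wrapper module used throughout the diff):

    # Anchors not required: try HEAD first, fall back to GET.
    list(_retrieval_methods(False, ''))
    # -> [(requests.head, {'allow_redirects': True}), (requests.get, {'stream': True})]

    # Anchor must be verified: GET only, since the response body is needed.
    list(_retrieval_methods(True, 'usage'))
    # -> [(requests.get, {'stream': True})]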