 - Validates links by making HTTP requests
 - Supports parallel validation for better performance
 - Provides detailed error reporting for broken links
+- Soft-passes HTTP 429 responses for specific domains (e.g., HashiCorp docs) to avoid CI flakiness

 Usage Examples:
     # Find all links containing 'docs.zenml.io' in a directory
@@ -39,6 +40,9 @@
     # Use custom URL path mappings
     python link_checker.py --dir docs --replace-links --url-mapping user-guide=user-guides

+    # Soft-pass 429 for HashiCorp docs and skip HEAD for those domains (defaults)
+    python link_checker.py --dir docs --substring http --validate-links --ci-mode
+
 Arguments:
     --dir: Directory containing markdown files to scan
     --files: List of specific markdown files to scan
@@ -49,6 +53,9 @@
     --timeout: Timeout for HTTP requests in seconds (default: 10)
     --url-mapping: Path segment mappings in format old=new (can be used multiple times)
     --ci-mode: CI mode: only report broken links and exit with error code on failures
+    --ignore-429-domain: Domain for which HTTP 429 should be treated as a soft pass (can be used multiple times)
+    --no-head-domain: Domain for which to skip HEAD and use GET directly (can be used multiple times)
+    --user-agent: Custom User-Agent header to use for HTTP requests

 Note:
     The 'requests' package is required for link validation. Install it with:
@@ -61,6 +68,7 @@
 import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, List, Optional, Tuple
+from urllib.parse import urlparse

 try:
     import requests
@@ -71,6 +79,22 @@
 except ImportError:
     HAS_REQUESTS = False

+# Default policies for troublesome domains that frequently rate-limit automated traffic.
+# These defaults can be extended via CLI flags.
+DEFAULT_IGNORE_429_DOMAINS = {
+    "developer.hashicorp.com",
+    "terraform.io",
+    "www.terraform.io",
+}
+DEFAULT_NO_HEAD_DOMAINS = {
+    "developer.hashicorp.com",
+    "terraform.io",
+    "www.terraform.io",
+}
+DEFAULT_USER_AGENT = (
+    "ZenML-LinkChecker/1.0 (+https://github.com/zenml-io/zenml)"
+)
+

 def find_markdown_files(directory: str) -> List[str]:
     """Find all markdown files in the given directory and its subdirectories."""
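The new module-level sets are consumed later by `check_link_validity`, which normalizes the URL's hostname before the membership test. As a minimal, standalone sketch of that matching rule (the helper name here is illustrative, not part of the patch):

```python
from urllib.parse import urlparse

DEFAULT_IGNORE_429_DOMAINS = {
    "developer.hashicorp.com",
    "terraform.io",
    "www.terraform.io",
}


def is_ignore_429_domain(url: str) -> bool:
    # Hostnames are compared case-insensitively; scheme, port, and path are ignored.
    hostname = (urlparse(url).hostname or "").lower()
    return hostname in DEFAULT_IGNORE_429_DOMAINS


print(is_ignore_429_domain("https://developer.hashicorp.com/terraform/docs"))  # True
print(is_ignore_429_domain("https://docs.zenml.io/"))  # False
```

Because the match is exact, subdomains such as `registry.terraform.io` are not covered by the defaults unless added explicitly via `--ignore-429-domain`.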
@@ -227,14 +251,21 @@ def is_local_development_url(url: str) -> bool:


 def check_link_validity(
-    url: str, timeout: int = 10
+    url: str,
+    timeout: int = 10,
+    ignore_429_domains: Optional[set] = None,
+    no_head_domains: Optional[set] = None,
+    user_agent: Optional[str] = None,
 ) -> Tuple[str, bool, Optional[str], Optional[int]]:
     """
     Check if a URL is valid by making an HTTP request.

     Args:
         url: The URL to check
         timeout: Request timeout in seconds
+        ignore_429_domains: Domains for which HTTP 429 should be considered a soft pass
+        no_head_domains: Domains for which to skip HEAD and only use GET
+        user_agent: Custom User-Agent header

     Returns:
         Tuple of (url, is_valid, error_message, status_code)
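For orientation, a call that exercises the new keyword arguments would look roughly like this (a sketch; the values mirror the module defaults and are only illustrative):

```python
# Assumes the script is importable as a module (it is invoked as link_checker.py above).
from link_checker import check_link_validity

url, is_valid, error_message, status_code = check_link_validity(
    "https://developer.hashicorp.com/terraform/docs",
    timeout=10,
    ignore_429_domains={"developer.hashicorp.com"},
    no_head_domains={"developer.hashicorp.com"},
    user_agent="ZenML-LinkChecker/1.0 (+https://github.com/zenml-io/zenml)",
)
# If the host still answers 429 after retries, this returns
# is_valid=True and status_code=429 instead of failing the link.
```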
@@ -258,36 +289,66 @@ def check_link_validity(
     if is_local_development_url(cleaned_url):
         return url, True, None, None

-    # Configure session with retries
+    parsed = urlparse(cleaned_url)
+    hostname = (parsed.hostname or "").lower()
+    ignore_429 = bool(ignore_429_domains) and hostname in ignore_429_domains
+    skip_head = bool(no_head_domains) and hostname in no_head_domains
+
+    # Configure session with retries. We respect Retry-After and avoid raising
+    # on the final status, returning the last response instead. This allows us
+    # to interpret 429 as a soft pass for specific domains.
     session = requests.Session()
     retries = Retry(
         total=3,
-        backoff_factor=0.5,
+        backoff_factor=1.0,
+        respect_retry_after_header=True,
         status_forcelist=[429, 500, 502, 503, 504],
         allowed_methods=["HEAD", "GET"],
+        raise_on_status=False,
     )
     session.mount("http://", HTTPAdapter(max_retries=retries))
     session.mount("https://", HTTPAdapter(max_retries=retries))
+    session.headers.update(
+        {
+            "User-Agent": user_agent or DEFAULT_USER_AGENT,
+            "Accept": "*/*",
+        }
+    )

     try:
-        # First try with HEAD request
-        response = session.head(
-            cleaned_url, timeout=timeout, allow_redirects=True
-        )
+        # Strategy: HEAD first unless the domain is known to dislike HEAD,
+        # then fall back to GET if needed. Some sites rate-limit or block HEAD.
+        response = None
+        if not skip_head:
+            response = session.head(
+                cleaned_url, timeout=timeout, allow_redirects=True
+            )
+        else:
+            response = session.get(
+                cleaned_url, timeout=timeout, allow_redirects=True
+            )

-        # If HEAD fails, try GET
+        # If HEAD fails (>= 400), try GET
         if response.status_code >= 400:
            response = session.get(
                 cleaned_url, timeout=timeout, allow_redirects=True
             )

+        # Soft-pass 429 for configured domains
+        if response.status_code == 429 and ignore_429:
+            return (
+                url,
+                True,
+                "429 rate-limited (soft-pass for domain)",
+                429,
+            )
+
         is_valid = response.status_code < 400

         # Additional check for Gitbook URLs that return 200 for non-existent pages
         if is_valid and "docs.zenml.io" in cleaned_url:
             # We need to check for "noindex" meta tag which indicates a 404 page in Gitbook
             try:
-                # Use GET to fetch the page content
                 content_response = session.get(cleaned_url, timeout=timeout)
                 content = content_response.text.lower()

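The retry policy above is the crux of the change: `respect_retry_after_header=True` honours the server's Retry-After hint, and `raise_on_status=False` makes urllib3 hand back the final response once retries are exhausted instead of raising, so a persistent 429 can be inspected (and soft-passed) rather than surfacing as a `RetryError`. A standalone sketch of the same policy, runnable outside the script (the target URL is only an example):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retries = Retry(
    total=3,
    backoff_factor=1.0,                # exponential back-off between attempts
    respect_retry_after_header=True,   # sleep for the server's Retry-After value if present
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET"],
    raise_on_status=False,             # return the last response instead of raising
)
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

response = session.get("https://developer.hashicorp.com/terraform/docs", timeout=10)
print(response.status_code)  # 200 normally; 429 if still rate-limited after retries
```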
@@ -314,18 +375,35 @@ def check_link_validity(
         )

     except requests.RequestException as e:
+        # If we hit retry exhaustion with 429, soft-pass for configured domains
+        if "429" in str(e) and ignore_429:
+            return (
+                url,
+                True,
+                "429 rate-limited (soft-pass for domain)",
+                429,
+            )
         return url, False, str(e), None


 def validate_urls(
-    urls: List[str], max_workers: int = 10
+    urls: List[str],
+    max_workers: int = 10,
+    timeout: int = 10,
+    ignore_429_domains: Optional[set] = None,
+    no_head_domains: Optional[set] = None,
+    user_agent: Optional[str] = None,
 ) -> Dict[str, Tuple[bool, Optional[str], Optional[int]]]:
     """
     Validate multiple URLs in parallel.

     Args:
         urls: List of URLs to validate
         max_workers: Maximum number of parallel workers
+        timeout: Request timeout
+        ignore_429_domains: Domains for which HTTP 429 should be considered a soft pass
+        no_head_domains: Domains for which to skip HEAD and only use GET
+        user_agent: Custom User-Agent header

     Returns:
         Dictionary of {url: (is_valid, error_message, status_code)}
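Callers get the same tuple shape back per URL, so a soft-passed 429 stays distinguishable from a plain success. A rough usage sketch (assumes `requests` is installed and the network is reachable; the URLs are illustrative):

```python
from link_checker import validate_urls

results = validate_urls(
    [
        "https://docs.zenml.io/",
        "https://developer.hashicorp.com/terraform/docs",
    ],
    max_workers=4,
    timeout=10,
    ignore_429_domains={"developer.hashicorp.com"},
    no_head_domains={"developer.hashicorp.com"},
)
for url, (is_valid, error_message, status_code) in results.items():
    if is_valid and status_code == 429:
        print(f"soft-pass: {url}")
    elif not is_valid:
        print(f"broken:    {url} ({error_message})")
```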
@@ -336,8 +414,6 @@ def validate_urls(
     results = {}

     # Count and report GitHub links that will be skipped in validation
-    from urllib.parse import urlparse
-
     github_urls = [
         url
         for url in urls
@@ -369,7 +445,14 @@
         # Submit all URLs (GitHub links will be auto-skipped in check_link_validity)
         for url in urls:
             future_to_url[
-                executor.submit(check_link_validity, url, timeout=15)
+                executor.submit(
+                    check_link_validity,
+                    url,
+                    timeout=timeout,
+                    ignore_429_domains=ignore_429_domains,
+                    no_head_domains=no_head_domains,
+                    user_agent=user_agent,
+                )
             ] = url

         # Process results
@@ -384,7 +467,12 @@
                     f" Checked URL {i}/{len(urls)} [github.com]: ✓ Skipped (automatically marked valid)"
                 )
             else:
-                status = "✅ Valid" if is_valid else f"❌ {error_message}"
+                if is_valid and status_code == 429:
+                    status = "✅ Valid (429 soft-pass)"
+                else:
+                    status = (
+                        "✅ Valid" if is_valid else f"❌ {error_message}"
+                    )
                 domain = (
                     url.split("/")[2]
                     if "://" in url and "/" in url.split("://", 1)[1]
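The reporting branch reduces to a tiny formatting rule: a 429 soft pass is still counted as valid but is labelled so the CI log explains why it passed. A hypothetical helper with the same behaviour (not part of the patch):

```python
from typing import Optional


def format_status(
    is_valid: bool, status_code: Optional[int], error_message: Optional[str]
) -> str:
    if is_valid and status_code == 429:
        return "✅ Valid (429 soft-pass)"
    return "✅ Valid" if is_valid else f"❌ {error_message}"


print(format_status(True, 200, None))                     # ✅ Valid
print(format_status(True, 429, "429 rate-limited"))       # ✅ Valid (429 soft-pass)
print(format_status(False, None, "Connection timed out")) # ❌ Connection timed out
```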
@@ -482,6 +570,11 @@ def replace_links_in_file(
     dry_run: bool = False,
     validate_links: bool = False,
     url_mappings: Dict[str, str] = None,
+    *,
+    timeout: int = 10,
+    ignore_429_domains: Optional[set] = None,
+    no_head_domains: Optional[set] = None,
+    user_agent: Optional[str] = None,
 ) -> Dict[str, Tuple[str, bool, Optional[str]]]:
     """
     Replace relative links in the file with absolute URLs.
@@ -492,6 +585,10 @@ def replace_links_in_file(
         dry_run: If True, don't actually modify the file
         validate_links: If True, validate the generated links
         url_mappings: Dictionary of path segment mappings {old: new}
+        timeout: HTTP timeout for validation requests
+        ignore_429_domains: Domains for which HTTP 429 should be considered a soft pass
+        no_head_domains: Domains for which to skip HEAD and only use GET
+        user_agent: Custom User-Agent header

     Returns:
         Dictionary of {original_link: (new_link, is_valid, error_message)}
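The `url_mappings` dictionary mirrors the `--url-mapping old=new` CLI pairs. A small sketch of turning the CLI form into the dictionary form expected here (the helper name is illustrative; the script may parse the pairs differently):

```python
from typing import Dict, List


def parse_url_mappings(pairs: List[str]) -> Dict[str, str]:
    mappings: Dict[str, str] = {}
    for pair in pairs:
        old, sep, new = pair.partition("=")
        if sep and old and new:
            mappings[old] = new
    return mappings


print(parse_url_mappings(["user-guide=user-guides"]))  # {'user-guide': 'user-guides'}
```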
@@ -596,7 +693,13 @@ def should_replace_link(link: str) -> bool:
     # Validate links if requested
     validation_results = {}
     if validate_links and transformed_urls:
-        validation_results = validate_urls(transformed_urls)
+        validation_results = validate_urls(
+            transformed_urls,
+            timeout=timeout,
+            ignore_429_domains=ignore_429_domains,
+            no_head_domains=no_head_domains,
+            user_agent=user_agent,
+        )

     # Update the replacements dictionary with validation results
     for rel_link, (trans_link, _, _) in replacements.items():
@@ -731,6 +834,23 @@ def main():
         action="store_true",
         help="CI mode: only report broken links and exit with error code on failures",
     )
+    parser.add_argument(
+        "--ignore-429-domain",
+        action="append",
+        default=[],
+        help="Domain for which to consider HTTP 429 as a soft pass (can be used multiple times). Defaults include developer.hashicorp.com and terraform.io.",
+    )
+    parser.add_argument(
+        "--no-head-domain",
+        action="append",
+        default=[],
+        help="Domain for which to skip HEAD and only use GET (can be used multiple times). Defaults include developer.hashicorp.com and terraform.io.",
+    )
+    parser.add_argument(
+        "--user-agent",
+        default=DEFAULT_USER_AGENT,
+        help="User-Agent to use for HTTP requests.",
+    )
     args = parser.parse_args()

     # Check for requests module if validation is enabled
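Because both domain flags use `action="append"`, each occurrence adds one entry, and the parsed attribute names follow argparse's dash-to-underscore convention. A self-contained demo (the domain values are only examples):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--ignore-429-domain", action="append", default=[])
parser.add_argument("--no-head-domain", action="append", default=[])

args = parser.parse_args(
    [
        "--ignore-429-domain", "registry.terraform.io",
        "--ignore-429-domain", "releases.hashicorp.com",
    ]
)
print(args.ignore_429_domain)  # ['registry.terraform.io', 'releases.hashicorp.com']
print(args.no_head_domain)     # []
```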
@@ -766,6 +886,14 @@ def main():
         if not args.ci_mode:
             print(f"Scanning {len(files_to_scan)} specified markdown files")

+    # Merge defaults with CLI-provided domain policies
+    ignore_429_domains = DEFAULT_IGNORE_429_DOMAINS.union(
+        set(args.ignore_429_domain or [])
+    )
+    no_head_domains = DEFAULT_NO_HEAD_DOMAINS.union(
+        set(args.no_head_domain or [])
+    )
+
     if args.replace_links:
         # Replace links mode
         total_replacements = 0
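The effective policy is therefore the union of the built-in defaults and whatever was passed on the command line, so the flags can only widen the soft-pass list, never shrink it. A quick illustration (the extra domain is an example):

```python
DEFAULT_IGNORE_429_DOMAINS = {
    "developer.hashicorp.com",
    "terraform.io",
    "www.terraform.io",
}
cli_additions = ["registry.terraform.io"]

ignore_429_domains = DEFAULT_IGNORE_429_DOMAINS.union(set(cli_additions or []))
print(sorted(ignore_429_domains))
# ['developer.hashicorp.com', 'registry.terraform.io', 'terraform.io', 'www.terraform.io']
```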
@@ -780,6 +908,10 @@ def main():
                 args.dry_run,
                 args.validate_links,
                 url_mappings,
+                timeout=args.timeout,
+                ignore_429_domains=ignore_429_domains,
+                no_head_domains=no_head_domains,
+                user_agent=args.user_agent,
             )
             if replacements:
                 if not args.ci_mode:
@@ -887,7 +1019,13 @@ def main():
         if args.validate_links and links_to_validate:
             if not args.ci_mode:
                 print(f"\nValidating {len(links_to_validate)} links...")
-            validation_results = validate_urls(list(set(links_to_validate)))
+            validation_results = validate_urls(
+                list(set(links_to_validate)),
+                timeout=args.timeout,
+                ignore_429_domains=ignore_429_domains,
+                no_head_domains=no_head_domains,
+                user_agent=args.user_agent,
+            )

             valid_count = sum(
                 1 for result in validation_results.values() if result[0]
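The counting at the end feeds the `--ci-mode` contract from the docstring: genuinely broken links should fail the run, while 429 soft-passes count as valid. A condensed sketch of that rule over sample results (the data is illustrative, not produced by the script):

```python
import sys

validation_results = {
    "https://docs.zenml.io/": (True, None, 200),
    "https://developer.hashicorp.com/terraform/docs": (
        True,
        "429 rate-limited (soft-pass for domain)",
        429,
    ),
    "https://example.com/missing": (False, "404 Client Error", 404),
}

valid_count = sum(1 for result in validation_results.values() if result[0])
broken = {url: r for url, r in validation_results.items() if not r[0]}
print(f"{valid_count} valid, {len(broken)} broken")
for url, (_, error_message, _) in broken.items():
    print(f"BROKEN: {url} ({error_message})")
sys.exit(1 if broken else 0)
```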