|
79 | 79 | except ImportError: |
80 | 80 | HAS_REQUESTS = False |
81 | 81 |
|
| 82 | +# ------------------------------------------------------------------------------ |
| 83 | +# Centralized exceptions for HTTP status codes we consider acceptable for |
| 84 | +# specific URLs or domains during link validation. |
| 85 | +# |
| 86 | +# How to extend: |
| 87 | +# - To allow particular statuses for a *single* URL, add an entry to |
| 88 | +# EXEMPT_URL_STATUS with the normalized URL (no trailing slash) and a set |
| 89 | +# of allowed status codes. |
| 90 | +# - To allow particular statuses for an entire *domain*, add an entry to |
| 91 | +# EXEMPT_DOMAIN_STATUS with the domain suffix (e.g. "example.com") and a |
| 92 | +# set of allowed status codes. Matching uses host.endswith(domain). |
| 93 | +# ------------------------------------------------------------------------------ |
| 94 | + |
# Per-URL exemptions: maps one exact URL to the set of HTTP status codes that
# are accepted for it. Keys must be written WITHOUT a trailing slash, because
# is_exception_status() strips trailing slashes from the URL before the lookup.
EXEMPT_URL_STATUS: Dict[str, set] = {
    # Requires authentication; often returns 403 to unauthenticated HEAD/GET.
    "https://huggingface.co/settings/tokens": {403},
}
| 99 | + |
# Per-domain exemptions: maps a domain suffix to the set of HTTP status codes
# that are accepted for any URL whose hostname ends with that suffix (see
# is_exception_status()).
# NOTE(review): because matching is by suffix, the parent entries
# ("hashicorp.com", "terraform.io") already cover the subdomain entries listed
# below for that lookup; keep the subdomain keys only if other code reads this
# table by exact key -- TODO confirm.
EXEMPT_DOMAIN_STATUS: Dict[str, set] = {
    # Some HashiCorp properties rate-limit automated checks.
    "hashicorp.com": {429},
    "developer.hashicorp.com": {429},
    "terraform.io": {429},
    "www.terraform.io": {429},
}
| 107 | + |
82 | 108 | # Default policies for troublesome domains that frequently rate-limit automated traffic. |
83 | 109 | # These defaults can be extended via CLI flags. |
84 | 110 | DEFAULT_IGNORE_429_DOMAINS = { |
|
96 | 122 | ) |
97 | 123 |
|
98 | 124 |
|
def is_exception_status(cleaned_url: str, status_code: int) -> bool:
    """
    Return True if (cleaned_url, status_code) should be treated as valid based
    on configured exemptions.

    Rules:
    - Only ``http://`` and ``https://`` URLs are considered; anything else
      returns False.
    - Exact-URL exemptions (EXEMPT_URL_STATUS): compared after stripping any
      trailing slashes from the URL.
    - Domain exemptions (EXEMPT_DOMAIN_STATUS): the URL hostname must be the
      configured domain itself or a subdomain of it, i.e. the match is made on
      a DNS label boundary. A host that merely shares a textual suffix
      (``nothashicorp.com`` vs ``hashicorp.com``) is NOT exempt.

    Args:
        cleaned_url: The URL that was requested.
        status_code: The HTTP status code received for that URL.

    Returns:
        True when the status code is explicitly exempted for this URL or for
        its domain, False otherwise (including when the URL cannot be parsed).
    """
    if not cleaned_url.startswith(("http://", "https://")):
        return False

    # Exact URL exemptions (normalize by removing trailing slash).
    normalized = cleaned_url.rstrip("/")
    if status_code in EXEMPT_URL_STATUS.get(normalized, ()):
        return True

    # Domain exemptions. urlparse raises ValueError for malformed authorities
    # (e.g. an unterminated IPv6 literal); such URLs can never match a domain.
    try:
        host = urlparse(cleaned_url).hostname or ""
    except ValueError:
        return False
    if not host:
        return False

    # Require an exact host match or a "."-delimited suffix so that unrelated
    # hosts sharing trailing characters with an exempt domain do not qualify.
    return any(
        status_code in codes and (host == domain or host.endswith("." + domain))
        for domain, codes in EXEMPT_DOMAIN_STATUS.items()
    )
| 154 | + |
| 155 | + |
99 | 156 | def find_markdown_files(directory: str) -> List[str]: |
100 | 157 | """Find all markdown files in the given directory and its subdirectories.""" |
101 | 158 | markdown_files = [] |
@@ -334,7 +391,11 @@ def check_link_validity( |
334 | 391 | cleaned_url, timeout=timeout, allow_redirects=True |
335 | 392 | ) |
336 | 393 |
|
337 | | - # Soft-pass 429 for configured domains |
| 394 | + # Check configured exception rules (domain- and URL-specific) |
| 395 | + if is_exception_status(cleaned_url, response.status_code): |
| 396 | + return (url, True, None, response.status_code) |
| 397 | + |
| 398 | + # Also check for 429 soft-pass for configured domains (backward compatibility) |
338 | 399 | if response.status_code == 429 and ignore_429: |
339 | 400 | return ( |
340 | 401 | url, |
|
0 commit comments