|
17 | 17 | # limitations under the License. |
18 | 18 |
|
19 | 19 | import re |
| 20 | +from urllib.parse import urlparse, urlunparse, quote |
20 | 21 |
|
21 | 22 |
|
22 | 23 | # Git reference validation pattern |
|
26 | 27 | GIT_REF_PATTERN = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$' |
27 | 28 |
|
28 | 29 |
|
# Allowed URL schemes.
# Enforces:
#  - URLs must start with https
URL_ALLOWED_SCHEMES = frozenset({"https"})


# Allowed URL domains (network locations).
# Enforces:
#  - URLs must belong to one of these domains
URL_ALLOWED_NETLOCS = frozenset({"github.com", "readthedocs.com", "docs.python.org"})


# Maximum allowed URL length
MAX_URL_LENGTH = 2048  # Common browser limit
"""Maximum allowed length for URL validation.

Should be large enough for most URLs but no larger than common browser limits.

Unit-Testing:

	First set up test fixtures by importing utils.

		>>> import docs.utils as _utils
		>>>

		>>> _utils.MAX_URL_LENGTH is not None
		True
		>>> type(_utils.MAX_URL_LENGTH) is type(int())
		True
		>>> _utils.MAX_URL_LENGTH > 0
		True
		>>> _utils.MAX_URL_LENGTH >= 256
		True
		>>> _utils.MAX_URL_LENGTH <= 2048
		True
		>>>

"""


# Error messages for URL validation
INVALID_LENGTH_ERROR = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters."
"""Length error message for URL validation.

Unit-Testing:

	First set up test fixtures by importing utils.

		>>> import docs.utils as _utils
		>>>

		>>> _utils.INVALID_LENGTH_ERROR is not None
		True
		>>> type(_utils.INVALID_LENGTH_ERROR) is type(str())
		True
		>>> len(_utils.INVALID_LENGTH_ERROR) > 0
		True
		>>>

"""


INVALID_SCHEME_ERROR = "Invalid URL scheme. Only 'https' is allowed."
"""Scheme error message for URL validation.

Unit-Testing:

	First set up test fixtures by importing utils.

		>>> import docs.utils as _utils
		>>>

		>>> _utils.INVALID_SCHEME_ERROR is not None
		True
		>>> type(_utils.INVALID_SCHEME_ERROR) is type(str())
		True
		>>> len(_utils.INVALID_SCHEME_ERROR) > 0
		True
		>>>

"""


# Join the sorted domain names instead of interpolating the frozenset itself:
# the set's repr would leak "frozenset({...})" into the user-facing message and
# its element order is not stable between interpreter runs.
INVALID_DOMAIN_ERROR = (
	"Invalid or untrusted domain. "
	f"Only {', '.join(sorted(URL_ALLOWED_NETLOCS))} are allowed."
)
"""Domain error message for URL validation.

Lists the allowed domains in alphabetical order so the text is deterministic.

Unit-Testing:

	First set up test fixtures by importing utils.

		>>> import docs.utils as _utils
		>>>

		>>> _utils.INVALID_DOMAIN_ERROR is not None
		True
		>>> type(_utils.INVALID_DOMAIN_ERROR) is type(str())
		True
		>>> len(_utils.INVALID_DOMAIN_ERROR) > 0
		True
		>>> 'frozenset' not in _utils.INVALID_DOMAIN_ERROR
		True
		>>>

"""
| 132 | + |
| 133 | + |
29 | 134 | def _validate_git_ref(ref: str) -> str: |
30 | 135 | """ |
31 | 136 | Validate if the provided string is a valid Git reference. |
@@ -126,3 +231,92 @@ def slugify_header(s: str) -> str: |
126 | 231 | text = re.sub(r'[^\w\- ]', "", s).strip().lower() |
127 | 232 | # Then replace consecutive spaces or dashes with a single dash |
128 | 233 | return re.sub(r'[-\s]+', "-", text) |
| 234 | + |
| 235 | + |
def _requote_component(component: str, safe: str) -> str:
	"""
	Percent-encode a URL component without double-encoding existing escapes.

	Args:
		component (str) -- The raw URL component (path, params, query or fragment).
		safe (str) -- Characters that must be left unencoded in this component.

	Returns:
		str -- The encoded component.
	"""
	# A "%" that does not introduce a valid %XX escape is literal data, so encode it.
	escaped = re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", component)
	# Every remaining "%" now starts a valid escape; keeping it safe prevents
	# "%20" from being re-encoded as "%2520".
	return quote(escaped, safe=safe + "%")


def sanitize_url(url: str) -> str:
	"""
	Sanitize and validate a URL according to allowed schemes and domains.

	This function validates that the URL is not too long, uses an allowed scheme
	(https) and points to a trusted domain, then percent-encodes its path, params,
	query and fragment components. Existing percent-escapes are preserved, so
	sanitizing an already sanitized URL returns it unchanged. The domain is
	matched case-insensitively and returned in lowercase.

	Args:
		url (str) -- The URL to sanitize.

	Returns:
		str -- The sanitized URL.

	Raises:
		ValueError -- If the URL is too long, has an invalid scheme or points to
			an untrusted domain.


	Unit-Testing:

		Testcase 0: First set up test fixtures by importing utils.

			>>> import docs.utils as _utils
			>>>

		Testcase 1: Basic URL with spaces and special characters.

			>>> url_fxtr = "https://github.com/user/Hello World!"
			>>> _utils.sanitize_url(url_fxtr)
			'https://github.com/user/Hello%20World%21'
			>>>

		Testcase 2: Already encoded URLs are not encoded twice.

			>>> url_fxtr = "https://github.com/user/Hello%20World%21"
			>>> _utils.sanitize_url(url_fxtr)
			'https://github.com/user/Hello%20World%21'
			>>>

		Testcase 3: Domains are matched case-insensitively.

			>>> url_fxtr = "https://GitHub.com/user"
			>>> _utils.sanitize_url(url_fxtr)
			'https://github.com/user'
			>>>

	"""
	# Validate length
	if len(url) > MAX_URL_LENGTH:
		raise ValueError(INVALID_LENGTH_ERROR)
	parsed_url = urlparse(url)
	# Validate scheme (urlparse already lowercases it)
	if parsed_url.scheme not in URL_ALLOWED_SCHEMES:
		raise ValueError(INVALID_SCHEME_ERROR)
	# Validate netloc. Host names are case-insensitive, so compare in lowercase.
	# The whole netloc is compared, so user-info or an explicit port is rejected.
	netloc = parsed_url.netloc.lower()
	if netloc not in URL_ALLOWED_NETLOCS:
		raise ValueError(INVALID_DOMAIN_ERROR)
	# Reconstruct the URL from the validated netloc and the encoded components;
	# each safe set preserves the delimiters that structure that component.
	return urlunparse((
		parsed_url.scheme,
		netloc,
		_requote_component(parsed_url.path, safe="/="),
		_requote_component(parsed_url.params, safe="="),
		_requote_component(parsed_url.query, safe="&="),
		_requote_component(parsed_url.fragment, safe="/?&="),
	))
| 290 | + |
| 291 | + |
def sanitize_intersphinx_mapping(mapping: dict) -> dict:
	"""
	Return a copy of an intersphinx mapping with every URL sanitized.

	Each value is expected to be a two-item tuple of (url, extra_value). The URL
	is passed through sanitize_url, while the key and the extra value are carried
	over untouched. The input mapping is not modified.

	Args:
		mapping (dict) -- A dictionary mapping names to tuples of (url, extra_value).

	Returns:
		dict -- A new dictionary with the same keys and sanitized URLs.

	Raises:
		ValueError -- If sanitize_url rejects one of the URLs.

	Unit-Testing:

		Testcase 1: Basic intersphinx mapping.

			>>> mapping = {'python': ('https://docs.python.org/3', None)}
			>>> sanitize_intersphinx_mapping(mapping)
			{'python': ('https://docs.python.org/3', None)}

		Testcase 2: Mapping with URL containing special characters.

			>>> mapping = {'project': ('https://github.com/user/project with spaces', None)}
			>>> result = sanitize_intersphinx_mapping(mapping)
			>>> result['project'][0]
			'https://github.com/user/project%20with%20spaces'
			>>>

	"""
	cleaned = {}
	for name, (target, inventory) in mapping.items():
		# Only the URL is rewritten; the accompanying value is kept as given.
		cleaned[name] = (sanitize_url(target), inventory)
	return cleaned
0 commit comments