|
17 | 17 | # limitations under the License. |
18 | 18 |
|
19 | 19 | import re |
20 | | -from urllib.parse import urlparse, urlunparse, quote |
| 20 | +import unicodedata |
| 21 | +from urllib.parse import ParseResult, urlparse, urlunparse, quote |
21 | 22 |
|
22 | 23 |
|
23 | 24 | # Git reference validation pattern |
24 | 25 | # Enforces: |
25 | 26 | # - Must start with alphanumeric character |
26 | 27 | # - Can contain alphanumeric characters, underscore, hyphen, forward slash, and dot |
27 | | -GIT_REF_PATTERN = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$' |
| 28 | +GIT_REF_PATTERN: str = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$' |
28 | 29 |
|
29 | 30 |
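A minimal sketch of how this pattern might be exercised; the `is_valid_git_ref` helper is illustrative and not part of this module:

```python
import re

GIT_REF_PATTERN = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$'

def is_valid_git_ref(ref: str) -> bool:
    """Hypothetical helper: True when the ref satisfies the pattern above."""
    return re.match(GIT_REF_PATTERN, ref) is not None

assert is_valid_git_ref("main")
assert is_valid_git_ref("feature/CEP-7_docs-v1.2")
assert not is_valid_git_ref("-starts-with-dash")  # must start with an alphanumeric character
assert not is_valid_git_ref("has space")          # spaces are not in the allowed set
```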
|
30 | 31 | # URL allowed scheme list |
31 | 32 | # Enforces: |
32 | 33 | # - URLs Must start with https |
33 | | -URL_ALLOWED_SCHEMES = frozenset({"https"}) |
| 34 | +URL_ALLOWED_SCHEMES: frozenset[str] = frozenset({"https"})
34 | 35 |
|
35 | 36 |
|
36 | 37 | # URL allowed domain list |
37 | 38 | # Enforces: |
38 | 39 | # - URLs Must belong to one of these domains |
39 | | -URL_ALLOWED_NETLOCS = frozenset({ |
| 40 | +URL_ALLOWED_NETLOCS: frozenset[str] = frozenset({
40 | 41 | "github.com", "gist.github.com", "readthedocs.com", "docs.python.org", "peps.python.org", |
41 | 42 | }) |
42 | 43 |
|
43 | 44 |
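A brief illustrative check of how these allowlists are meant to be consulted (mirroring the validation in `sanitize_url` further down); the URL here is only an example:

```python
from urllib.parse import urlparse

URL_ALLOWED_SCHEMES = frozenset({"https"})
URL_ALLOWED_NETLOCS = frozenset({
    "github.com", "gist.github.com", "readthedocs.com", "docs.python.org", "peps.python.org",
})

parsed = urlparse("https://docs.python.org/3/library/urllib.parse.html")
assert parsed.scheme in URL_ALLOWED_SCHEMES  # only https passes
assert parsed.netloc in URL_ALLOWED_NETLOCS  # exact host match; unlisted subdomains do not pass
```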
|
44 | 45 | # Maximum allowed URL length |
45 | | -MAX_URL_LENGTH = 2048 # Common browser limit |
| 46 | +MAX_URL_LENGTH: int = 2048 # Common browser limit |
46 | 47 | """Maximum allowed length for URL validation. |
47 | 48 |
|
48 | 49 | Should be large enough for most URLs but no larger than common browser limits. |
|
70 | 71 |
|
71 | 72 |
|
72 | 73 | # Error messages for URL validation |
73 | | -INVALID_LENGTH_ERROR = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters." |
| 74 | +INVALID_LENGTH_ERROR: str = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters." |
74 | 75 | """Length error message for URL validation. |
75 | 76 |
|
76 | 77 | Unit-Testing: |
|
91 | 92 | """ |
92 | 93 |
|
93 | 94 |
|
94 | | -INVALID_SCHEME_ERROR = "Invalid URL scheme. Only 'https' is allowed." |
| 95 | +INVALID_SCHEME_ERROR: str = "Invalid URL scheme. Only 'https' is allowed." |
95 | 96 | """Scheme error message for URL validation. |
96 | 97 |
|
97 | 98 | Unit-Testing: |
|
112 | 113 | """ |
113 | 114 |
|
114 | 115 |
|
115 | | -INVALID_DOMAIN_ERROR = f"Invalid or untrusted domain. Only {URL_ALLOWED_NETLOCS} are allowed." |
| 116 | +INVALID_DOMAIN_ERROR: str = f"Invalid or untrusted domain. Only {URL_ALLOWED_NETLOCS} are allowed." |
116 | 117 | """Domain error message for URL validation. |
117 | 118 |
|
118 | 119 | Unit-Testing: |
@@ -229,8 +230,10 @@ def slugify_header(s: str) -> str: |
229 | 230 | >>> slugify_header("[CEP-7] Documentation *Guide*") |
230 | 231 | 'cep-7-documentation-guide' |
231 | 232 | """ |
232 | | - # First, remove special characters and convert to lowercase |
233 | | - text = re.sub(r'[^\w\- ]', "", s).strip().lower() |
| 233 | + # First, normalize Unicode characters
| 234 | + text: str = unicodedata.normalize('NFKC', s) # added in v2.0.9a6 |
| 235 | + # Then, remove special characters and convert to lowercase |
| 236 | + text = re.sub(r'[^\w\- ]', "", text).strip().lower() |
234 | 237 | # Then replace consecutive spaces or dashes with a single dash |
235 | 238 | return re.sub(r'[-\s]+', "-", text) |
236 | 239 |
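A short sketch of what the added NFKC step provides; the fullwidth input below is illustrative:

```python
import re
import unicodedata

def slugify_header(s: str) -> str:
    # Mirrors the updated function: normalize, strip specials, lowercase, collapse separators.
    text = unicodedata.normalize('NFKC', s)
    text = re.sub(r'[^\w\- ]', "", text).strip().lower()
    return re.sub(r'[-\s]+', "-", text)

# Fullwidth compatibility characters fold to their ASCII forms under NFKC, so visually
# similar headings now yield the same anchor slug.
assert slugify_header("ＣＥＰ－７ Ｇｕｉｄｅ") == slugify_header("CEP-7 Guide") == "cep-7-guide"
```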
|
@@ -270,20 +273,22 @@ def sanitize_url(url: str) -> str: |
270 | 273 | # Validate length |
271 | 274 | if len(url) > MAX_URL_LENGTH: |
272 | 275 | raise ValueError(INVALID_LENGTH_ERROR) |
273 | | - parsed_url = urlparse(url) |
| 276 | + parsed_url: ParseResult = urlparse(url) |
274 | 277 | # Validate scheme |
275 | 278 | if parsed_url.scheme not in URL_ALLOWED_SCHEMES: |
276 | 279 | raise ValueError(INVALID_SCHEME_ERROR) |
277 | 280 | # Validate netloc |
278 | 281 | if parsed_url.netloc not in URL_ALLOWED_NETLOCS: |
279 | 282 | raise ValueError(INVALID_DOMAIN_ERROR) |
| 283 | + # Normalize netloc |
| 284 | + sanitized_netloc: str = unicodedata.normalize('NFKC', parsed_url.netloc) # added in v2.0.9a6 |
280 | 285 | # Sanitize path and query - using the safe parameter to preserve URL structure |
281 | | - sanitized_path = quote(parsed_url.path, safe="/=") |
282 | | - sanitized_query = quote(parsed_url.query, safe="&=") |
| 286 | + sanitized_path: str = quote(unicodedata.normalize('NFKC', parsed_url.path), safe="/=") |
| 287 | + sanitized_query: str = quote(parsed_url.query, safe="&=") |
283 | 288 | # Reconstruct the sanitized URL |
284 | 289 | return urlunparse(( |
285 | 290 | parsed_url.scheme, |
286 | | - parsed_url.netloc, |
| 291 | + sanitized_netloc, |
287 | 292 | sanitized_path, |
288 | 293 | parsed_url.params, |
289 | 294 | sanitized_query, |
|
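Taken together, a hedged end-to-end sketch of the validation flow; it condenses the function shown in this diff (the trailing fragment element of the `urlunparse` tuple is assumed from the standard 6-tuple), and the rejected URLs are illustrative:

```python
import unicodedata
from urllib.parse import ParseResult, urlparse, urlunparse, quote

URL_ALLOWED_SCHEMES = frozenset({"https"})
URL_ALLOWED_NETLOCS = frozenset({
    "github.com", "gist.github.com", "readthedocs.com", "docs.python.org", "peps.python.org",
})
MAX_URL_LENGTH = 2048

def sanitize_url(url: str) -> str:
    """Condensed restatement of the diff's sanitize_url, for illustration only."""
    if len(url) > MAX_URL_LENGTH:
        raise ValueError(f"URL exceeds maximum length of {MAX_URL_LENGTH} characters.")
    parsed_url: ParseResult = urlparse(url)
    if parsed_url.scheme not in URL_ALLOWED_SCHEMES:
        raise ValueError("Invalid URL scheme. Only 'https' is allowed.")
    if parsed_url.netloc not in URL_ALLOWED_NETLOCS:
        raise ValueError("Invalid or untrusted domain.")
    sanitized_netloc = unicodedata.normalize('NFKC', parsed_url.netloc)
    sanitized_path = quote(unicodedata.normalize('NFKC', parsed_url.path), safe="/=")
    sanitized_query = quote(parsed_url.query, safe="&=")
    return urlunparse((parsed_url.scheme, sanitized_netloc, sanitized_path,
                       parsed_url.params, sanitized_query, parsed_url.fragment))

print(sanitize_url("https://docs.python.org/3/library/urllib.parse.html"))
# https://docs.python.org/3/library/urllib.parse.html

for bad in ("http://github.com/org/repo", "https://example.com/page"):
    try:
        sanitize_url(bad)
    except ValueError as err:
        print(f"rejected {bad!r}: {err}")
```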