Skip to content

Commit 192d50e

Browse files
[PATCH] Hardened documentation a little by normalizing unicode when sanitizing (- WIP #274 -)
Changes in file docs/utils.py: * improved sanitize_url to normalize the netloc and path components of URLs * hardened slugify_header to also normalize Unicode
1 parent 1e90003 commit 192d50e

File tree

1 file changed

+19
-14
lines changed

1 file changed

+19
-14
lines changed

docs/utils.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,32 +17,33 @@
1717
# limitations under the License.
1818

1919
import re
20-
from urllib.parse import urlparse, urlunparse, quote
20+
import unicodedata
21+
from urllib.parse import ParseResult, urlparse, urlunparse, quote
2122

2223

2324
# Git reference validation pattern
2425
# Enforces:
2526
# - Must start with alphanumeric character
2627
# - Can contain alphanumeric characters, underscore, hyphen, forward slash, and dot
27-
GIT_REF_PATTERN = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$'
28+
GIT_REF_PATTERN: str = r'^[a-zA-Z0-9][a-zA-Z0-9_\-./]*$'
2829

2930

3031
# URL allowed scheme list
3132
# Enforces:
3233
# - URLs must start with https
33-
URL_ALLOWED_SCHEMES = frozenset({"https"})
34+
# Annotated as frozenset (not list): the value is an immutable set of allowed schemes.
URL_ALLOWED_SCHEMES: frozenset = frozenset({"https"})
3435

3536

3637
# URL allowed domain list
3738
# Enforces:
3839
# - URLs must belong to one of these domains
39-
URL_ALLOWED_NETLOCS = frozenset({
40+
# Annotated as frozenset (not list): the value is an immutable set of trusted domains.
URL_ALLOWED_NETLOCS: frozenset = frozenset({
    "github.com", "gist.github.com", "readthedocs.com", "docs.python.org", "peps.python.org",
})
4243

4344

4445
# Maximum allowed URL length
45-
MAX_URL_LENGTH = 2048 # Common browser limit
46+
MAX_URL_LENGTH: int = 2048 # Common browser limit
4647
"""Maximum allowed length for URL validation.
4748
4849
Should be large enough for most URLs but no larger than common browser limits.
@@ -70,7 +71,7 @@
7071

7172

7273
# Error messages for URL validation
73-
INVALID_LENGTH_ERROR = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters."
74+
INVALID_LENGTH_ERROR: str = f"URL exceeds maximum length of {MAX_URL_LENGTH} characters."
7475
"""Length error message for URL validation.
7576
7677
Unit-Testing:
@@ -91,7 +92,7 @@
9192
"""
9293

9394

94-
INVALID_SCHEME_ERROR = "Invalid URL scheme. Only 'https' is allowed."
95+
INVALID_SCHEME_ERROR: str = "Invalid URL scheme. Only 'https' is allowed."
9596
"""Scheme error message for URL validation.
9697
9798
Unit-Testing:
@@ -112,7 +113,7 @@
112113
"""
113114

114115

115-
INVALID_DOMAIN_ERROR = f"Invalid or untrusted domain. Only {URL_ALLOWED_NETLOCS} are allowed."
116+
INVALID_DOMAIN_ERROR: str = f"Invalid or untrusted domain. Only {URL_ALLOWED_NETLOCS} are allowed."
116117
"""Domain error message for URL validation.
117118
118119
Unit-Testing:
@@ -229,8 +230,10 @@ def slugify_header(s: str) -> str:
229230
>>> slugify_header("[CEP-7] Documentation *Guide*")
230231
'cep-7-documentation-guide'
231232
"""
232-
# First, remove special characters and convert to lowercase
233-
text = re.sub(r'[^\w\- ]', "", s).strip().lower()
233+
# First, normalize Unicode characters
234+
text: str = unicodedata.normalize('NFKC', s) # added in v2.0.9a6
235+
# Then, remove special characters and convert to lowercase
236+
text = re.sub(r'[^\w\- ]', "", text).strip().lower()
234237
# Then replace consecutive spaces or dashes with a single dash
235238
return re.sub(r'[-\s]+', "-", text)
236239

@@ -270,20 +273,22 @@ def sanitize_url(url: str) -> str:
270273
# Validate length
271274
if len(url) > MAX_URL_LENGTH:
272275
raise ValueError(INVALID_LENGTH_ERROR)
273-
parsed_url = urlparse(url)
276+
parsed_url: ParseResult = urlparse(url)
274277
# Validate scheme
275278
if parsed_url.scheme not in URL_ALLOWED_SCHEMES:
276279
raise ValueError(INVALID_SCHEME_ERROR)
277280
# Validate netloc
278281
if parsed_url.netloc not in URL_ALLOWED_NETLOCS:
279282
raise ValueError(INVALID_DOMAIN_ERROR)
283+
# Normalize netloc
284+
sanitized_netloc: str = unicodedata.normalize('NFKC', parsed_url.netloc) # added in v2.0.9a6
280285
# Sanitize path and query - using the safe parameter to preserve URL structure
281-
sanitized_path = quote(parsed_url.path, safe="/=")
282-
sanitized_query = quote(parsed_url.query, safe="&=")
286+
sanitized_path: str = quote(unicodedata.normalize('NFKC', parsed_url.path), safe="/=")
287+
sanitized_query: str = quote(parsed_url.query, safe="&=")
283288
# Reconstruct the sanitized URL
284289
return urlunparse((
285290
parsed_url.scheme,
286-
parsed_url.netloc,
291+
sanitized_netloc,
287292
sanitized_path,
288293
parsed_url.params,
289294
sanitized_query,

0 commit comments

Comments
 (0)