diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index b2bde5a9b1d696..62875fda904857 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1227,6 +1227,21 @@ def test_parse_qs_encoding(self):
                                                          errors="ignore")
         self.assertEqual(result, {'key': ['\u0141-']})
 
+    def test_qsl_strict_parsing_raises(self):
+        with self.assertRaises(ValueError):
+            urllib.parse.parse_qsl("foo=^", strict_parsing=True)
+
+        with self.assertRaises(ValueError):
+            urllib.parse.parse_qsl(b"foo=`", strict_parsing=True)
+
+    def test_qsl_strict_parsing_accepts_encoded(self):
+        # '+' and percent-escapes are valid on the wire even though they
+        # decode to characters that may not appear literally.
+        result = urllib.parse.parse_qsl("a=b+c&d=%5E%20", strict_parsing=True)
+        self.assertEqual(result, [('a', 'b c'), ('d', '^ ')])
+        result = urllib.parse.parse_qsl(b"a=b+c&d=%5E", strict_parsing=True)
+        self.assertEqual(result, [(b'a', b'b c'), (b'd', b'^')])
+
     def test_parse_qsl_encoding(self):
         result = urllib.parse.parse_qsl("key=\u0141%E9", encoding="latin-1")
         self.assertEqual(result, [('key', '\u0141\xE9')])
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 67d9bbea0d3150..45d062cc11ff65 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -91,6 +91,9 @@
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
 
+# Non-alphanumeric characters allowed in a query field per RFC 3986.
+_VALID_RFC3986_QUERY_CHARS = "-._~!$&'()*+,;=:@/?%"
+
 def clear_cache():
     """Clear internal performance caches. Undocumented; some tests want it."""
     urlsplit.cache_clear()
@@ -778,6 +781,19 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             parsed_result[name] = [value]
     return parsed_result
 
+def _is_valid_rfc3986_query(chars):
+    """Return True if all characters are valid per RFC 3986.
+
+    *chars* is a still percent-encoded query field, as str or bytes.
+    Only ASCII letters, digits and the punctuation listed in
+    _VALID_RFC3986_QUERY_CHARS are accepted.
+    """
+    if isinstance(chars, (bytes, bytearray)):
+        # latin-1 maps every byte to one code point, so non-ASCII bytes
+        # survive the decode and are rejected by the isascii() check.
+        chars = chars.decode('latin-1')
+    return chars.isascii() and all(
+        ch.isalnum() or ch in _VALID_RFC3986_QUERY_CHARS for ch in chars)
 
 def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
               encoding='utf-8', errors='replace', max_num_fields=None, separator='&', *, _stacklevel=1):
@@ -854,6 +870,12 @@ def _unquote(s):
             name, has_eq, value = name_value.partition(eq)
             if not has_eq and strict_parsing:
                 raise ValueError("bad query field: %r" % (name_value,))
+            if strict_parsing and not _is_valid_rfc3986_query(name_value):
+                # Validate the raw field: after unquoting, '+' and
+                # percent-escapes decode to characters (such as ' ' or '^')
+                # that are only legal in their encoded form.
+                raise ValueError("Invalid characters in query string "
+                                 f"per RFC 3986: {name_value!r}")
             if value or keep_blank_values:
                 name = _unquote(name)
                 value = _unquote(value)
diff --git a/Misc/NEWS.d/next/Library/2025-08-31-13-00-22.gh-issue-138284.6MOp4k.rst b/Misc/NEWS.d/next/Library/2025-08-31-13-00-22.gh-issue-138284.6MOp4k.rst
new file mode 100644
index 00000000000000..9e4c88f08c2c73
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-08-31-13-00-22.gh-issue-138284.6MOp4k.rst
@@ -0,0 +1 @@
+:mod:`urllib.parse`: with ``strict_parsing=True``, :func:`~urllib.parse.parse_qsl` now raises :exc:`ValueError` for query fields containing characters that :rfc:`3986` does not allow in a query component.