From a60c9beb50050fe1dde3352dea7136073c85c606 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 22 Aug 2024 10:00:19 +0300 Subject: [PATCH 1/7] gh-67041: Allow to distinguish between empty and not defined URI components Changes in the urllib.parse module: * Add option allow_none in urlparse(), urlsplit() and urldefrag(). If it is true, represent not defined components as None instead of an empty string. * Add option keep_empty in urlunparse() and urlunsplit(). If it is true, keep empty non-None components in the resulting string. * Add option keep_empty in the geturl() method of DefragResult, SplitResult, ParseResult and the corresponding bytes counterparts. --- Lib/test/test_urlparse.py | 434 +++++++++++++++++++++----------------- Lib/urllib/parse.py | 137 ++++++++---- 2 files changed, 335 insertions(+), 236 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index d49e4388696ab4..f845f0c00b0072 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1,3 +1,4 @@ +import functools import sys import unicodedata import unittest @@ -101,25 +102,40 @@ (b"%81=%A9", {b'\x81': [b'\xa9']}), ] +def parametrise_allow_none(test): + @functools.wraps(test) + def wrapper(self): + for allow_none in False, True: + with self.subTest(allow_none=allow_none): + test(self, allow_none=allow_none) + return wrapper + class UrlParseTestCase(unittest.TestCase): - def checkRoundtrips(self, url, parsed, split, url2=None): + def _encode(self, s): + if isinstance(s, str): + return s.encode('ascii') + if isinstance(s, tuple): + return tuple(self._encode(x) for x in s) + return s + + def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True): if url2 is None: url2 = url - result = urllib.parse.urlparse(url) + result = urllib.parse.urlparse(url, allow_none=allow_none) self.assertSequenceEqual(result, parsed) t = (result.scheme, result.netloc, result.path, result.params, result.query, result.fragment) 
self.assertSequenceEqual(t, parsed) # put it back together and it should be the same - result2 = urllib.parse.urlunparse(result) + result2 = urllib.parse.urlunparse(result, keep_empty=allow_none) self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl()) + self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urllib.parse.urlparse(result.geturl()) - self.assertEqual(result3.geturl(), result.geturl()) + result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none) + self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -133,18 +149,18 @@ def checkRoundtrips(self, url, parsed, split, url2=None): self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urllib.parse.urlsplit(url) + result = urllib.parse.urlsplit(url, allow_none=allow_none) self.assertSequenceEqual(result, split) t = (result.scheme, result.netloc, result.path, result.query, result.fragment) self.assertSequenceEqual(t, split) - result2 = urllib.parse.urlunsplit(result) + result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none) self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl()) + self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) # check the fixpoint property of re-parsing the result of geturl() - result3 = urllib.parse.urlsplit(result.geturl()) - self.assertEqual(result3.geturl(), result.geturl()) + result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none) + self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) self.assertSequenceEqual(result3, result) 
self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -178,135 +194,130 @@ def test_qs(self): def test_roundtrips(self): str_cases = [ ('path/to/file', - ('', '', 'path/to/file', '', '', ''), - ('', '', 'path/to/file', '', '')), + (None, None, 'path/to/file', None, None, None), + (None, None, 'path/to/file', None, None)), ('/path/to/file', - ('', '', '/path/to/file', '', '', ''), - ('', '', '/path/to/file', '', '')), + (None, None, '/path/to/file', None, None, None), + (None, None, '/path/to/file', None, None)), ('//path/to/file', - ('', 'path', '/to/file', '', '', ''), - ('', 'path', '/to/file', '', '')), + (None, 'path', '/to/file', None, None, None), + (None, 'path', '/to/file', None, None)), ('////path/to/file', - ('', '', '//path/to/file', '', '', ''), - ('', '', '//path/to/file', '', '')), + (None, '', '//path/to/file', None, None, None), + (None, '', '//path/to/file', None, None)), ('/////path/to/file', - ('', '', '///path/to/file', '', '', ''), - ('', '', '///path/to/file', '', '')), + (None, '', '///path/to/file', None, None, None), + (None, '', '///path/to/file', None, None)), ('scheme:path/to/file', - ('scheme', '', 'path/to/file', '', '', ''), - ('scheme', '', 'path/to/file', '', '')), + ('scheme', None, 'path/to/file', None, None, None), + ('scheme', None, 'path/to/file', None, None)), ('scheme:/path/to/file', - ('scheme', '', '/path/to/file', '', '', ''), - ('scheme', '', '/path/to/file', '', '')), + ('scheme', None, '/path/to/file', None, None, None), + ('scheme', None, '/path/to/file', None, None)), ('scheme://path/to/file', - ('scheme', 'path', '/to/file', '', '', ''), - ('scheme', 'path', '/to/file', '', '')), + ('scheme', 'path', '/to/file', None, None, None), + ('scheme', 'path', '/to/file', None, None)), ('scheme:////path/to/file', - ('scheme', '', '//path/to/file', '', '', ''), - ('scheme', '', '//path/to/file', '', '')), + ('scheme', '', '//path/to/file', None, None, None), + ('scheme', '', 
'//path/to/file', None, None)), ('scheme://///path/to/file', - ('scheme', '', '///path/to/file', '', '', ''), - ('scheme', '', '///path/to/file', '', '')), + ('scheme', '', '///path/to/file', None, None, None), + ('scheme', '', '///path/to/file', None, None)), ('file:tmp/junk.txt', - ('file', '', 'tmp/junk.txt', '', '', ''), - ('file', '', 'tmp/junk.txt', '', '')), + ('file', None, 'tmp/junk.txt', None, None, None), + ('file', None, 'tmp/junk.txt', None, None)), ('file:///tmp/junk.txt', - ('file', '', '/tmp/junk.txt', '', '', ''), - ('file', '', '/tmp/junk.txt', '', '')), + ('file', '', '/tmp/junk.txt', None, None, None), + ('file', '', '/tmp/junk.txt', None, None)), ('file:////tmp/junk.txt', - ('file', '', '//tmp/junk.txt', '', '', ''), - ('file', '', '//tmp/junk.txt', '', '')), + ('file', '', '//tmp/junk.txt', None, None, None), + ('file', '', '//tmp/junk.txt', None, None)), ('file://///tmp/junk.txt', - ('file', '', '///tmp/junk.txt', '', '', ''), - ('file', '', '///tmp/junk.txt', '', '')), + ('file', '', '///tmp/junk.txt', None, None, None), + ('file', '', '///tmp/junk.txt', None, None)), ('http:tmp/junk.txt', - ('http', '', 'tmp/junk.txt', '', '', ''), - ('http', '', 'tmp/junk.txt', '', '')), + ('http', None, 'tmp/junk.txt', None, None, None), + ('http', None, 'tmp/junk.txt', None, None)), ('http://example.com/tmp/junk.txt', - ('http', 'example.com', '/tmp/junk.txt', '', '', ''), - ('http', 'example.com', '/tmp/junk.txt', '', '')), + ('http', 'example.com', '/tmp/junk.txt', None, None, None), + ('http', 'example.com', '/tmp/junk.txt', None, None)), ('http:///example.com/tmp/junk.txt', - ('http', '', '/example.com/tmp/junk.txt', '', '', ''), - ('http', '', '/example.com/tmp/junk.txt', '', '')), + ('http', '', '/example.com/tmp/junk.txt', None, None, None), + ('http', '', '/example.com/tmp/junk.txt', None, None)), ('http:////example.com/tmp/junk.txt', - ('http', '', '//example.com/tmp/junk.txt', '', '', ''), - ('http', '', '//example.com/tmp/junk.txt', '', '')), 
+ ('http', '', '//example.com/tmp/junk.txt', None, None, None), + ('http', '', '//example.com/tmp/junk.txt', None, None)), ('imap://mail.python.org/mbox1', - ('imap', 'mail.python.org', '/mbox1', '', '', ''), - ('imap', 'mail.python.org', '/mbox1', '', '')), + ('imap', 'mail.python.org', '/mbox1', None, None, None), + ('imap', 'mail.python.org', '/mbox1', None, None)), ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf', ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', - '', '', ''), + None, None, None), ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', - '', '')), + None, None)), ('nfs://server/path/to/file.txt', - ('nfs', 'server', '/path/to/file.txt', '', '', ''), - ('nfs', 'server', '/path/to/file.txt', '', '')), + ('nfs', 'server', '/path/to/file.txt', None, None, None), + ('nfs', 'server', '/path/to/file.txt', None, None)), ('svn+ssh://svn.zope.org/repos/main/ZConfig/trunk/', ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/', - '', '', ''), + None, None, None), ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/', - '', '')), + None, None)), ('git+ssh://git@github.com/user/project.git', ('git+ssh', 'git@github.com','/user/project.git', - '','',''), + None,None,None), ('git+ssh', 'git@github.com','/user/project.git', - '', '')), + None, None)), ('itms-services://?action=download-manifest&url=https://example.com/app', - ('itms-services', '', '', '', - 'action=download-manifest&url=https://example.com/app', ''), + ('itms-services', '', '', None, + 'action=download-manifest&url=https://example.com/app', None), ('itms-services', '', '', - 'action=download-manifest&url=https://example.com/app', '')), + 'action=download-manifest&url=https://example.com/app', None)), ('+scheme:path/to/file', - ('', '', '+scheme:path/to/file', '', '', ''), - ('', '', '+scheme:path/to/file', '', '')), + (None, None, '+scheme:path/to/file', None, None, None), + (None, None, '+scheme:path/to/file', None, None)), ('sch_me:path/to/file', - ('', '', 
'sch_me:path/to/file', '', '', ''), - ('', '', 'sch_me:path/to/file', '', '')), + (None, None, 'sch_me:path/to/file', None, None, None), + (None, None, 'sch_me:path/to/file', None, None)), ] - def _encode(t): - return (t[0].encode('ascii'), - tuple(x.encode('ascii') for x in t[1]), - tuple(x.encode('ascii') for x in t[2])) - bytes_cases = [_encode(x) for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] str_cases += [ ('schème:path/to/file', - ('', '', 'schème:path/to/file', '', '', ''), - ('', '', 'schème:path/to/file', '', '')), + (None, None, 'schème:path/to/file', None, None, None), + (None, None, 'schème:path/to/file', None, None)), ] for url, parsed, split in str_cases + bytes_cases: with self.subTest(url): - self.checkRoundtrips(url, parsed, split) + self.checkRoundtrips(url, parsed, split, allow_none=True) + empty = url[:0] + parsed = tuple(x or empty for x in parsed) + split = tuple(x or empty for x in split) + self.checkRoundtrips(url, parsed, split, allow_none=False) def test_roundtrips_normalization(self): str_cases = [ ('///path/to/file', - '/path/to/file', - ('', '', '/path/to/file', '', '', ''), - ('', '', '/path/to/file', '', '')), + '///path/to/file', + (None, '', '/path/to/file', None, None, None), + (None, '', '/path/to/file', None, None)), ('scheme:///path/to/file', - 'scheme:/path/to/file', - ('scheme', '', '/path/to/file', '', '', ''), - ('scheme', '', '/path/to/file', '', '')), + 'scheme:///path/to/file', + ('scheme', '', '/path/to/file', None, None, None), + ('scheme', '', '/path/to/file', None, None)), ('file:/tmp/junk.txt', - 'file:///tmp/junk.txt', - ('file', '', '/tmp/junk.txt', '', '', ''), - ('file', '', '/tmp/junk.txt', '', '')), + 'file:/tmp/junk.txt', + ('file', None, '/tmp/junk.txt', None, None, None), + ('file', None, '/tmp/junk.txt', None, None)), ('http:/tmp/junk.txt', - 'http:///tmp/junk.txt', - ('http', '', '/tmp/junk.txt', '', '', ''), - ('http', '', '/tmp/junk.txt', '', '')), + 'http:/tmp/junk.txt', + 
('http', None, '/tmp/junk.txt', None, None, None), + ('http', None, '/tmp/junk.txt', None, None)), ('https:/tmp/junk.txt', - 'https:///tmp/junk.txt', - ('https', '', '/tmp/junk.txt', '', '', ''), - ('https', '', '/tmp/junk.txt', '', '')), + 'https:/tmp/junk.txt', + ('https', None, '/tmp/junk.txt', None, None, None), + ('https', None, '/tmp/junk.txt', None, None)), ] - def _encode(t): - return (t[0].encode('ascii'), - t[1].encode('ascii'), - tuple(x.encode('ascii') for x in t[2]), - tuple(x.encode('ascii') for x in t[3])) - bytes_cases = [_encode(x) for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] for url, url2, parsed, split in str_cases + bytes_cases: with self.subTest(url): self.checkRoundtrips(url, parsed, split, url2) @@ -317,26 +328,22 @@ def test_http_roundtrips(self): # Three cheers for white box knowledge! str_cases = [ ('://www.python.org', - ('www.python.org', '', '', '', ''), - ('www.python.org', '', '', '')), + ('www.python.org', '', None, None, None), + ('www.python.org', '', None, None)), ('://www.python.org#abc', - ('www.python.org', '', '', '', 'abc'), - ('www.python.org', '', '', 'abc')), + ('www.python.org', '', None, None, 'abc'), + ('www.python.org', '', None, 'abc')), ('://www.python.org?q=abc', - ('www.python.org', '', '', 'q=abc', ''), - ('www.python.org', '', 'q=abc', '')), + ('www.python.org', '', None, 'q=abc', None), + ('www.python.org', '', 'q=abc', None)), ('://www.python.org/#abc', - ('www.python.org', '/', '', '', 'abc'), - ('www.python.org', '/', '', 'abc')), + ('www.python.org', '/', None, None, 'abc'), + ('www.python.org', '/', None, 'abc')), ('://a/b/c/d;p?q#f', ('a', '/b/c/d', 'p', 'q', 'f'), ('a', '/b/c/d;p', 'q', 'f')), ] - def _encode(t): - return (t[0].encode('ascii'), - tuple(x.encode('ascii') for x in t[1]), - tuple(x.encode('ascii') for x in t[2])) - bytes_cases = [_encode(x) for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] str_schemes = ('http', 'https') bytes_schemes = 
(b'http', b'https') str_tests = str_schemes, str_cases @@ -347,25 +354,31 @@ def _encode(t): url = scheme + url parsed = (scheme,) + parsed split = (scheme,) + split - self.checkRoundtrips(url, parsed, split) + with self.subTest(url): + self.checkRoundtrips(url, parsed, split) def checkJoin(self, base, relurl, expected, *, relroundtrip=True): with self.subTest(base=base, relurl=relurl): self.assertEqual(urllib.parse.urljoin(base, relurl), expected) - baseb = base.encode('ascii') - relurlb = relurl.encode('ascii') - expectedb = expected.encode('ascii') + baseb = self._encode(base) + relurlb = self._encode(relurl) + expectedb = self._encode(expected) self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) if relroundtrip: - relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) - self.assertEqual(urllib.parse.urljoin(base, relurl), expected) - relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) - self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) + relurl2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) + self.assertEqual(urllib.parse.urljoin(base, relurl2), expected) + relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) + self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb) + + relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True) + self.assertEqual(urllib.parse.urljoin(base, relurl3), expected) + relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True) + self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb) def test_unparse_parse(self): str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] - bytes_cases = [x.encode('ascii') for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] for u in str_cases + bytes_cases: self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) 
self.assertEqual(urllib.parse.urlunparse(urllib.parse.urlparse(u)), u) @@ -396,7 +409,7 @@ def test_RFC1808(self): self.checkJoin(RFC1808_BASE, '../../g', 'http://a/g') # "abnormal" cases from RFC 1808: - self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') + self.checkJoin(RFC1808_BASE, None, 'http://a/b/c/d;p?q#f') self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..') @@ -422,6 +435,8 @@ def test_RFC2368(self): # Issue 11467: path that starts with a number is not parsed correctly self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'), ('mailto', '', '1337@example.org', '', '', '')) + self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org', allow_none=True), + ('mailto', None, '1337@example.org', None, None, None)) def test_RFC2396(self): # cases from RFC 2396 @@ -656,9 +671,7 @@ def test_RFC2732(self): ('http://[::ffff:12.34.56.78]:/foo/', '::ffff:12.34.56.78', None), ] - def _encode(t): - return t[0].encode('ascii'), t[1].encode('ascii'), t[2] - bytes_cases = [_encode(x) for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] for url, hostname, port in str_cases + bytes_cases: urlparsed = urllib.parse.urlparse(url) self.assertEqual((urlparsed.hostname, urlparsed.port) , (hostname, port)) @@ -669,25 +682,25 @@ def _encode(t): 'ftp://[::1/foo/bad]/bad', 'http://[::1/foo/bad]/bad', 'http://[::ffff:12.34.56.78'] - bytes_cases = [x.encode('ascii') for x in str_cases] + bytes_cases = [self._encode(x) for x in str_cases] for invalid_url in str_cases + bytes_cases: self.assertRaises(ValueError, urllib.parse.urlparse, invalid_url) def test_urldefrag(self): str_cases = [ ('http://python.org#frag', 'http://python.org', 'frag'), - ('http://python.org', 'http://python.org', ''), + ('http://python.org', 'http://python.org', None), ('http://python.org/#frag', 'http://python.org/', 'frag'), - ('http://python.org/', 
'http://python.org/', ''), + ('http://python.org/', 'http://python.org/', None), ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'), - ('http://python.org/?q', 'http://python.org/?q', ''), + ('http://python.org/?q', 'http://python.org/?q', None), ('http://python.org/p#frag', 'http://python.org/p', 'frag'), - ('http://python.org/p?q', 'http://python.org/p?q', ''), + ('http://python.org/p?q', 'http://python.org/p?q', None), (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), - (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), + (RFC2396_BASE, 'http://a/b/c/d;p?q', None), ('http://a/b/c;p?q#f', 'http://a/b/c;p?q', 'f'), ('http://a/b/c;p?q#', 'http://a/b/c;p?q', ''), - ('http://a/b/c;p?q', 'http://a/b/c;p?q', ''), + ('http://a/b/c;p?q', 'http://a/b/c;p?q', None), ('http://a/b/c;p?#f', 'http://a/b/c;p?', 'f'), ('http://a/b/c;p#f', 'http://a/b/c;p', 'f'), ('http://a/b/c;?q#f', 'http://a/b/c;?q', 'f'), @@ -700,16 +713,21 @@ def test_urldefrag(self): ('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'), ] def _encode(t): - return type(t)(x.encode('ascii') for x in t) + return type(t)(self._encode(t)) bytes_cases = [_encode(x) for x in str_cases] - for url, defrag, frag in str_cases + bytes_cases: - with self.subTest(url): - result = urllib.parse.urldefrag(url) - hash = '#' if isinstance(url, str) else b'#' - self.assertEqual(result.geturl(), url.rstrip(hash)) - self.assertEqual(result, (defrag, frag)) - self.assertEqual(result.url, defrag) - self.assertEqual(result.fragment, frag) + for allow_none in True, False: + for url, defrag, frag in str_cases + bytes_cases: + with self.subTest(url=url, allow_none=allow_none): + result = urllib.parse.urldefrag(url, allow_none=allow_none) + if not allow_none: + hash = '#' if isinstance(url, str) else b'#' + url = url.rstrip(hash) + if frag is None: + frag = url[:0] + self.assertEqual(result.geturl(keep_empty=allow_none), url) + self.assertEqual(result, (defrag, frag)) + self.assertEqual(result.url, defrag) + self.assertEqual(result.fragment, frag) def 
test_urlsplit_scoped_IPv6(self): p = urllib.parse.urlsplit('http://[FE80::822a:a8ff:fe49:470c%tESt]:1234') @@ -945,24 +963,27 @@ def test_attributes_bad_scheme(self): self.assertEqual(p.scheme, b"") else: self.assertEqual(p.scheme, "") + p = parse(url, allow_none=True) + self.assertIsNone(p.scheme) - def test_attributes_without_netloc(self): + @parametrise_allow_none + def test_attributes_without_netloc(self, allow_none): # This example is straight from RFC 3261. It looks like it # should allow the username, hostname, and port to be filled # in, but doesn't. Since it's a URI and doesn't use the # scheme://netloc syntax, the netloc and related attributes # should be left empty. uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urllib.parse.urlsplit(uri) - self.assertEqual(p.netloc, "") + p = urllib.parse.urlsplit(uri, allow_none=allow_none) + self.assertEqual(p.netloc, None if allow_none else "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urllib.parse.urlparse(uri) - self.assertEqual(p.netloc, "") + p = urllib.parse.urlparse(uri, allow_none=allow_none) + self.assertEqual(p.netloc, None if allow_none else "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) @@ -971,16 +992,16 @@ def test_attributes_without_netloc(self): # You guessed it, repeating the test with bytes input uri = b"sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urllib.parse.urlsplit(uri) - self.assertEqual(p.netloc, b"") + p = urllib.parse.urlsplit(uri, allow_none=allow_none) + self.assertEqual(p.netloc, None if allow_none else b"") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urllib.parse.urlparse(uri) - self.assertEqual(p.netloc, b"") + p = 
urllib.parse.urlparse(uri, allow_none=allow_none) + self.assertEqual(p.netloc, None if allow_none else b"") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) @@ -994,67 +1015,86 @@ def test_noslash(self): self.assertEqual(urllib.parse.urlparse(b"http://example.com?blahblah=/foo"), (b'http', b'example.com', b'', b'', b'blahblah=/foo', b'')) - def test_withoutscheme(self): + @parametrise_allow_none + def test_withoutscheme(self, allow_none): # Test urlparse without scheme # Issue 754016: urlparse goes wrong with IP:port without scheme # RFC 1808 specifies that netloc should start with //, urlparse expects # the same, otherwise it classifies the portion of url as path. - self.assertEqual(urllib.parse.urlparse("path"), - ('','','path','','','')) - self.assertEqual(urllib.parse.urlparse("//www.python.org:80"), - ('','www.python.org:80','','','','')) - self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"), - ('http','www.python.org:80','','','','')) + none = None if allow_none else '' + self.assertEqual(urllib.parse.urlparse("path", allow_none=allow_none), + (none, none, 'path', none, none, none)) + self.assertEqual(urllib.parse.urlparse("//www.python.org:80", allow_none=allow_none), + (none, 'www.python.org:80', '', none, none, none)) + self.assertEqual(urllib.parse.urlparse("http://www.python.org:80", allow_none=allow_none), + ('http', 'www.python.org:80', '', none, none, none)) # Repeat for bytes input - self.assertEqual(urllib.parse.urlparse(b"path"), - (b'',b'',b'path',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"//www.python.org:80"), - (b'',b'www.python.org:80',b'',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"), - (b'http',b'www.python.org:80',b'',b'',b'',b'')) - - def test_portseparator(self): + none = None if allow_none else b'' + self.assertEqual(urllib.parse.urlparse(b"path", allow_none=allow_none), + (none, none, b'path', none, none, none)) + 
self.assertEqual(urllib.parse.urlparse(b"//www.python.org:80", allow_none=allow_none), + (none, b'www.python.org:80', b'', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80", allow_none=allow_none), + (b'http', b'www.python.org:80', b'', none, none, none)) + + @parametrise_allow_none + def test_portseparator(self, allow_none): # Issue 754016 makes changes for port separator ':' from scheme separator - self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','','')) - self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','','')) - self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','','')) - self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','','')) - self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','','')) - self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"), - ('http','www.python.org:80','','','','')) + none = None if allow_none else '' + self.assertEqual(urllib.parse.urlparse("http:80", allow_none=allow_none), + ('http', none, '80', none, none, none)) + self.assertEqual(urllib.parse.urlparse("https:80", allow_none=allow_none), + ('https', none, '80', none, none, none)) + self.assertEqual(urllib.parse.urlparse("path:80", allow_none=allow_none), + ('path', none, '80', none, none, none)) + self.assertEqual(urllib.parse.urlparse("http:", allow_none=allow_none), + ('http', none, '', none, none, none)) + self.assertEqual(urllib.parse.urlparse("https:", allow_none=allow_none), + ('https', none, '', none, none, none)) + self.assertEqual(urllib.parse.urlparse("http://www.python.org:80", allow_none=allow_none), + ('http', 'www.python.org:80', '', none, none, none)) # As usual, need to check bytes input as well - self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"path:80"), 
(b'path',b'',b'80',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b'')) - self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"), - (b'http',b'www.python.org:80',b'',b'',b'',b'')) + none = None if allow_none else b'' + self.assertEqual(urllib.parse.urlparse(b"http:80", allow_none=allow_none), + (b'http', none, b'80', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"https:80", allow_none=allow_none), + (b'https', none, b'80', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"path:80", allow_none=allow_none), + (b'path', none, b'80', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"http:", allow_none=allow_none), + (b'http', none, b'', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"https:", allow_none=allow_none), + (b'https', none, b'', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80", allow_none=allow_none), + (b'http', b'www.python.org:80', b'', none, none, none)) def test_usingsys(self): # Issue 3314: sys module is used in the error self.assertRaises(TypeError, urllib.parse.urlencode, "foo") - def test_anyscheme(self): + @parametrise_allow_none + def test_anyscheme(self, allow_none): # Issue 7904: s3://foo.com/stuff has netloc "foo.com". 
- self.assertEqual(urllib.parse.urlparse("s3://foo.com/stuff"), - ('s3', 'foo.com', '/stuff', '', '', '')) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff"), - ('x-newscheme', 'foo.com', '/stuff', '', '', '')) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query#fragment"), - ('x-newscheme', 'foo.com', '/stuff', '', 'query', 'fragment')) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query"), - ('x-newscheme', 'foo.com', '/stuff', '', 'query', '')) + none = None if allow_none else '' + self.assertEqual(urllib.parse.urlparse("s3://foo.com/stuff", allow_none=allow_none), + ('s3', 'foo.com', '/stuff', none, none, none)) + self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff", allow_none=allow_none), + ('x-newscheme', 'foo.com', '/stuff', none, none, none)) + self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + ('x-newscheme', 'foo.com', '/stuff', none, 'query', 'fragment')) + self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query", allow_none=allow_none), + ('x-newscheme', 'foo.com', '/stuff', none, 'query', none)) # And for bytes... 
- self.assertEqual(urllib.parse.urlparse(b"s3://foo.com/stuff"), - (b's3', b'foo.com', b'/stuff', b'', b'', b'')) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff"), - (b'x-newscheme', b'foo.com', b'/stuff', b'', b'', b'')) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query#fragment"), - (b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b'fragment')) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query"), - (b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b'')) + none = None if allow_none else b'' + self.assertEqual(urllib.parse.urlparse(b"s3://foo.com/stuff", allow_none=allow_none), + (b's3', b'foo.com', b'/stuff', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff", allow_none=allow_none), + (b'x-newscheme', b'foo.com', b'/stuff', none, none, none)) + self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', b'fragment')) + self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query", allow_none=allow_none), + (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', none)) def test_default_scheme(self): # Exercise the scheme parameter of urlparse() and urlsplit() @@ -1068,8 +1108,11 @@ def test_default_scheme(self): self.assertEqual(func("path", scheme="ftp").scheme, "ftp") self.assertEqual(func(b"path", scheme=b"ftp").scheme, b"ftp") self.assertEqual(func("path").scheme, "") + self.assertEqual(func("path", allow_none=True).scheme, None) self.assertEqual(func(b"path").scheme, b"") + self.assertEqual(func(b"path", allow_none=True).scheme, None) self.assertEqual(func(b"path", "").scheme, b"") + self.assertEqual(func(b"path", "", allow_none=True).scheme, b"") def test_parse_fragments(self): # Exercise the allow_fragments parameter of urlparse() and urlsplit() @@ -1096,6 +1139,12 @@ def test_parse_fragments(self): 
getattr(result, attr).endswith("#" + expected_frag)) self.assertEqual(func(url, "", False).fragment, "") + result = func(url, allow_fragments=False, allow_none=True) + self.assertIsNone(result.fragment) + self.assertTrue( + getattr(result, attr).endswith("#" + expected_frag)) + self.assertIsNone(func(url, "", False, allow_none=True).fragment) + result = func(url, allow_fragments=True) self.assertEqual(result.fragment, expected_frag) self.assertFalse( @@ -1368,6 +1417,11 @@ def test_telurl_params(self): self.assertEqual(p1.path, '+1-201-555-0123') self.assertEqual(p1.params, '') + p1 = urllib.parse.urlparse('tel:+1-201-555-0123', allow_none=True) + self.assertEqual(p1.scheme, 'tel') + self.assertEqual(p1.path, '+1-201-555-0123') + self.assertEqual(p1.params, None) + p1 = urllib.parse.urlparse('tel:7042;phone-context=example.com') self.assertEqual(p1.scheme, 'tel') self.assertEqual(p1.path, '7042') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5b00ab25c6b4ca..4c238cf7abff34 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -114,7 +114,8 @@ def _encode_result(obj, encoding=_implicit_encoding, def _decode_args(args, encoding=_implicit_encoding, errors=_implicit_errors): - return tuple(x.decode(encoding, errors) if x else '' for x in args) + return tuple(x.decode(encoding, errors) if x else '' if x is not None else x + for x in args) def _coerce_args(*args): # Invokes decode if necessary to create str args @@ -122,13 +123,20 @@ def _coerce_args(*args): # an appropriate result coercion function # - noop for str inputs # - encoding function otherwise - str_input = isinstance(args[0], str) - for arg in args[1:]: - # We special-case the empty string to support the - # "scheme=''" default argument to some functions - if arg and isinstance(arg, str) != str_input: - raise TypeError("Cannot mix str and non-str arguments") - if str_input: + str_input = None + for arg in args: + if arg: + if str_input is None: + str_input = isinstance(arg, str) + else: 
+ if isinstance(arg, str) != str_input: + raise TypeError("Cannot mix str and non-str arguments") + if str_input is None: + for arg in args: + if arg is not None: + str_input = isinstance(arg, str) + break + if str_input is not False: return args + (_noop,) return _decode_args(args) + (_encode_result,) @@ -138,7 +146,9 @@ class _ResultMixinStr(object): __slots__ = () def encode(self, encoding='ascii', errors='strict'): - return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self)) + return self._encoded_counterpart(*(x.encode(encoding, errors) + if x is not None else None + for x in self)) class _ResultMixinBytes(object): @@ -146,7 +156,9 @@ class _ResultMixinBytes(object): __slots__ = () def decode(self, encoding='ascii', errors='strict'): - return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) + return self._decoded_counterpart(*(x.decode(encoding, errors) + if x is not None else None + for x in self)) class _NetlocResultMixinBase(object): @@ -193,6 +205,8 @@ class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): @property def _userinfo(self): netloc = self.netloc + if netloc is None: + return None, None userinfo, have_info, hostinfo = netloc.rpartition('@') if have_info: username, have_password, password = userinfo.partition(':') @@ -205,6 +219,8 @@ def _userinfo(self): @property def _hostinfo(self): netloc = self.netloc + if netloc is None: + return None, None _, _, hostinfo = netloc.rpartition('@') _, have_open_br, bracketed = hostinfo.partition('[') if have_open_br: @@ -223,6 +239,8 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): @property def _userinfo(self): netloc = self.netloc + if netloc is None: + return None, None userinfo, have_info, hostinfo = netloc.rpartition(b'@') if have_info: username, have_password, password = userinfo.partition(b':') @@ -235,6 +253,8 @@ def _userinfo(self): @property def _hostinfo(self): netloc = self.netloc + if netloc is None: + return None, 
None _, _, hostinfo = netloc.rpartition(b'@') _, have_open_br, bracketed = hostinfo.partition(b'[') if have_open_br: @@ -319,43 +339,45 @@ def _hostinfo(self): # retained since deprecating it isn't worth the hassle ResultBase = _NetlocResultMixinStr +_ALLOW_NONE_DEFAULT = False + # Structured result objects for string data class DefragResult(_DefragResultBase, _ResultMixinStr): __slots__ = () - def geturl(self): - if self.fragment: + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + if self.fragment or (keep_empty and self.fragment is not None): return self.url + '#' + self.fragment else: return self.url class SplitResult(_SplitResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self): - return urlunsplit(self) + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + return urlunsplit(self, keep_empty=keep_empty) class ParseResult(_ParseResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self): - return urlunparse(self) + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + return urlunparse(self, keep_empty=keep_empty) # Structured result objects for bytes data class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): __slots__ = () - def geturl(self): - if self.fragment: + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + if self.fragment or (keep_empty and self.fragment is not None): return self.url + b'#' + self.fragment else: return self.url class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self): - return urlunsplit(self) + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + return urlunsplit(self, keep_empty=keep_empty) class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self): - return urlunparse(self) + def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): + return urlunparse(self, keep_empty=keep_empty) # Set up the encode/decode result pairs def _fix_result_transcoding(): @@ -371,7 +393,7 @@ def _fix_result_transcoding(): 
_fix_result_transcoding() del _fix_result_transcoding -def urlparse(url, scheme='', allow_fragments=True): +def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_DEFAULT): """Parse a URL into 6 components: :///;?# @@ -392,8 +414,16 @@ def urlparse(url, scheme='', allow_fragments=True): Note that % escapes are not expanded. """ url, scheme, _coerce_result = _coerce_args(url, scheme) + if url is None: + url = '' scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments) - result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '') + if not allow_none: + if scheme is None: scheme = '' + if netloc is None: netloc = '' + if params is None: params = '' + if query is None: query = '' + if fragment is None: fragment = '' + result = ParseResult(scheme, netloc, url, params, query, fragment) return _coerce_result(result) def _urlparse(url, scheme=None, allow_fragments=True): @@ -453,7 +483,7 @@ def _check_bracketed_host(hostname): # typed=True avoids BytesWarnings being emitted during cache key # comparison since this API supports both bytes and str input. 
@functools.lru_cache(typed=True) -def urlsplit(url, scheme='', allow_fragments=True): +def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_DEFAULT): """Parse a URL into 5 components: :///?# @@ -475,8 +505,15 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) + if url is None: + url = '' scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) - v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '') + if not allow_none: + if scheme is None: scheme = '' + if netloc is None: netloc = '' + if query is None: query = '' + if fragment is None: fragment = '' + v = SplitResult(scheme, netloc, url, query, fragment) return _coerce_result(v) def _urlsplit(url, scheme=None, allow_fragments=True): @@ -514,24 +551,28 @@ def _urlsplit(url, scheme=None, allow_fragments=True): _checknetloc(netloc) return (scheme, netloc, url, query, fragment) -def urlunparse(components): +def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? 
with an empty query (the draft states that these are equivalent).""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) - if not netloc: - if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): - netloc = '' - else: - netloc = None - if params: + if not keep_empty: + if not netloc: + if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): + netloc = '' + else: + netloc = None + if not scheme: scheme = None + if not params: params = None + if not query: query = None + if not fragment: fragment = None + if params is not None: url = "%s;%s" % (url, params) - return _coerce_result(_urlunsplit(scheme or None, netloc, url, - query or None, fragment or None)) + return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment)) -def urlunsplit(components): +def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT): """Combine the elements of a tuple as returned by urlsplit() into a complete URL as a string. The data argument can be any five-item iterable. 
This may result in a slightly different, but equivalent URL, if the URL that @@ -539,13 +580,16 @@ def urlunsplit(components): empty query; the RFC states that these are equivalent).""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if not netloc: - if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): - netloc = '' - else: - netloc = None - return _coerce_result(_urlunsplit(scheme or None, netloc, url, - query or None, fragment or None)) + if not keep_empty: + if not netloc: + if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): + netloc = '' + else: + netloc = None + if not scheme: scheme = None + if not query: query = None + if not fragment: fragment = None + return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment)) def _urlunsplit(scheme, netloc, url, query, fragment): if netloc is not None: @@ -633,7 +677,7 @@ def urljoin(base, url, allow_fragments=True): resolved_path) or '/', query, fragment)) -def urldefrag(url): +def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): """Removes any existing fragment from URL. Returns a tuple of the defragmented URL and the fragment. If @@ -645,9 +689,10 @@ def urldefrag(url): s, n, p, q, frag = _urlsplit(url) defrag = _urlunsplit(s, n, p, q, None) else: - frag = '' + frag = None defrag = url - return _coerce_result(DefragResult(defrag, frag or '')) + if not allow_none and frag is None: frag = '' + return _coerce_result(DefragResult(defrag, frag)) _hexdig = '0123456789ABCDEFabcdef' _hextobyte = None From a1dbfa6eb8a131ec1d5a222bd870e85b46177aed Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 31 Aug 2024 16:20:52 +0300 Subject: [PATCH 2/7] Document the things. 
--- Doc/library/urllib.parse.rst | 217 ++++++++++++++++++++++------------- Doc/whatsnew/3.14.rst | 11 ++ Lib/urllib/parse.py | 2 +- 3 files changed, 149 insertions(+), 81 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index fb5353e1895bf9..e08e8b20aadb8f 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -50,12 +50,15 @@ URL Parsing The URL parsing functions focus on splitting a URL string into its components, or on combining URL components into a URL string. -.. function:: urlparse(urlstring, scheme='', allow_fragments=True) +.. function:: urlparse(urlstring, scheme=None, allow_fragments=True, *, allow_none=False) Parse a URL into six components, returning a 6-item :term:`named tuple`. This corresponds to the general structure of a URL: ``scheme://netloc/path;parameters?query#fragment``. - Each tuple item is a string, possibly empty. The components are not broken up + Each tuple item is a string, possibly empty, or ``None`` if *allow_none* is true. + Components that are not defined are represented as an empty string (by default) or + ``None`` if *allow_none* is true. + The components are not broken up into smaller parts (for example, the network location is a single string), and % escapes are not expanded. The delimiters as shown above are not part of the result, except for a leading slash in the *path* component, which is retained if @@ -84,6 +87,12 @@ or on combining URL components into a URL string. 
80 >>> o._replace(fragment="").geturl() 'http://docs.python.org:80/3/library/urllib.parse.html?highlight=params' + >>> urlparse("http://docs.python.org?") + ParseResult(scheme='http', netloc='docs.python.org', + path='', params='', query='', fragment='') + >>> urlparse("http://docs.python.org?", allow_none=True) + ParseResult(scheme='http', netloc='docs.python.org', + path='', params=None, query='', fragment=None) Following the syntax specifications in :rfc:`1808`, urlparse recognizes a netloc only if it is properly introduced by '//'. Otherwise the @@ -101,47 +110,53 @@ or on combining URL components into a URL string. ParseResult(scheme='', netloc='', path='www.cwi.nl/%7Eguido/Python.html', params='', query='', fragment='') >>> urlparse('help/Python.html') - ParseResult(scheme='', netloc='', path='help/Python.html', params='', - query='', fragment='') + ParseResult(scheme='', netloc='', path='help/Python.html', + params='', query='', fragment='') + >>> urlparse('help/Python.html', allow_none=True) + ParseResult(scheme=None, netloc=None, path='help/Python.html', + params=None, query=None, fragment=None) The *scheme* argument gives the default addressing scheme, to be used only if the URL does not specify one. It should be the same type - (text or bytes) as *urlstring*, except that the default value ``''`` is + (text or bytes) as *urlstring* or ``None``, except that the ``''`` is always allowed, and is automatically converted to ``b''`` if appropriate. If the *allow_fragments* argument is false, fragment identifiers are not recognized. Instead, they are parsed as part of the path, parameters - or query component, and :attr:`fragment` is set to the empty string in - the return value. + or query component, and :attr:`fragment` is set to ``None`` or the empty + string (depending on the value of *allow_none*) in the return value. 
The return value is a :term:`named tuple`, which means that its items can be accessed by index or as named attributes, which are: - +------------------+-------+-------------------------+------------------------+ - | Attribute | Index | Value | Value if not present | - +==================+=======+=========================+========================+ - | :attr:`scheme` | 0 | URL scheme specifier | *scheme* parameter | - +------------------+-------+-------------------------+------------------------+ - | :attr:`netloc` | 1 | Network location part | empty string | - +------------------+-------+-------------------------+------------------------+ - | :attr:`path` | 2 | Hierarchical path | empty string | - +------------------+-------+-------------------------+------------------------+ - | :attr:`params` | 3 | Parameters for last | empty string | - | | | path element | | - +------------------+-------+-------------------------+------------------------+ - | :attr:`query` | 4 | Query component | empty string | - +------------------+-------+-------------------------+------------------------+ - | :attr:`fragment` | 5 | Fragment identifier | empty string | - +------------------+-------+-------------------------+------------------------+ - | :attr:`username` | | User name | :const:`None` | - +------------------+-------+-------------------------+------------------------+ - | :attr:`password` | | Password | :const:`None` | - +------------------+-------+-------------------------+------------------------+ - | :attr:`hostname` | | Host name (lower case) | :const:`None` | - +------------------+-------+-------------------------+------------------------+ - | :attr:`port` | | Port number as integer, | :const:`None` | - | | | if present | | - +------------------+-------+-------------------------+------------------------+ + +------------------+-------+-------------------------+-------------------------------+ + | Attribute | Index | Value | Value if not present | + 
+==================+=======+=========================+===============================+ + | :attr:`scheme` | 0 | URL scheme specifier | *scheme* parameter or | + | | | | empty string [1]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`netloc` | 1 | Network location part | ``None`` or empty string [1]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`path` | 2 | Hierarchical path | empty string | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`params` | 3 | Parameters for last | ``None`` or empty string [1]_ | + | | | path element | | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`query` | 4 | Query component | ``None`` or empty string [1]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`fragment` | 5 | Fragment identifier | ``None`` or empty string [1]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`username` | | User name | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`password` | | Password | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`hostname` | | Host name (lower case) | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`port` | | Port number as integer, | ``None`` | + | | | if present | | + +------------------+-------+-------------------------+-------------------------------+ + + .. [1] Depending on the value of the *allow_none* argument. Reading the :attr:`port` attribute will raise a :exc:`ValueError` if an invalid port is specified in the URL. See section @@ -187,12 +202,15 @@ or on combining URL components into a URL string. .. 
versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of - returning :const:`None`. + returning ``None``. .. versionchanged:: 3.8 Characters that affect netloc parsing under NFKC normalization will now raise :exc:`ValueError`. + .. versionchanged:: 3.14 + Added the *allow_none* parameter. + .. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, separator='&') @@ -283,16 +301,25 @@ or on combining URL components into a URL string. separator key, with ``&`` as the default separator. -.. function:: urlunparse(parts) +.. function:: urlunparse(parts, *, keep_empty=False) Construct a URL from a tuple as returned by ``urlparse()``. The *parts* - argument can be any six-item iterable. This may result in a slightly - different, but equivalent URL, if the URL that was parsed originally had - unnecessary delimiters (for example, a ``?`` with an empty query; the RFC - states that these are equivalent). + argument can be any six-item iterable. + + This may result in a slightly different, but equivalent URL, if the + URL that was parsed originally had unnecessary delimiters (for example, + a ``?`` with an empty query; the RFC states that these are equivalent). + + If *keep_empty* is true, empty strings are kept in the result (for example, + a ``?`` for an empty query), only ``None`` components are omitted. + This allows to restore the URL that was parsed with option + ``allow_none=True``. + .. versionchanged:: 3.14 + Added the *keep_empty* parameter. -.. function:: urlsplit(urlstring, scheme='', allow_fragments=True) + +.. function:: urlsplit(urlstring, scheme=None, allow_fragments=True, *, allow_none=False) This is similar to :func:`urlparse`, but does not split the params from the URL. This should generally be used instead of :func:`urlparse` if the more recent URL @@ -306,28 +333,31 @@ or on combining URL components into a URL string. 
The return value is a :term:`named tuple`, its items can be accessed by index or as named attributes: - +------------------+-------+-------------------------+----------------------+ - | Attribute | Index | Value | Value if not present | - +==================+=======+=========================+======================+ - | :attr:`scheme` | 0 | URL scheme specifier | *scheme* parameter | - +------------------+-------+-------------------------+----------------------+ - | :attr:`netloc` | 1 | Network location part | empty string | - +------------------+-------+-------------------------+----------------------+ - | :attr:`path` | 2 | Hierarchical path | empty string | - +------------------+-------+-------------------------+----------------------+ - | :attr:`query` | 3 | Query component | empty string | - +------------------+-------+-------------------------+----------------------+ - | :attr:`fragment` | 4 | Fragment identifier | empty string | - +------------------+-------+-------------------------+----------------------+ - | :attr:`username` | | User name | :const:`None` | - +------------------+-------+-------------------------+----------------------+ - | :attr:`password` | | Password | :const:`None` | - +------------------+-------+-------------------------+----------------------+ - | :attr:`hostname` | | Host name (lower case) | :const:`None` | - +------------------+-------+-------------------------+----------------------+ - | :attr:`port` | | Port number as integer, | :const:`None` | - | | | if present | | - +------------------+-------+-------------------------+----------------------+ + +------------------+-------+-------------------------+-------------------------------+ + | Attribute | Index | Value | Value if not present | + +==================+=======+=========================+===============================+ + | :attr:`scheme` | 0 | URL scheme specifier | *scheme* parameter or | + | | | | empty string [1]_ | + 
+------------------+-------+-------------------------+-------------------------------+ + | :attr:`netloc` | 1 | Network location part | ``None`` or empty string [2]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`path` | 2 | Hierarchical path | empty string | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`query` | 3 | Query component | ``None`` or empty string [2]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`fragment` | 4 | Fragment identifier | ``None`` or empty string [2]_ | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`username` | | User name | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`password` | | Password | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`hostname` | | Host name (lower case) | ``None`` | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`port` | | Port number as integer, | ``None`` | + | | | if present | | + +------------------+-------+-------------------------+-------------------------------+ + + .. [2] Depending on the value of the *allow_none* argument. Reading the :attr:`port` attribute will raise a :exc:`ValueError` if an invalid port is specified in the URL. See section @@ -352,7 +382,7 @@ or on combining URL components into a URL string. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of - returning :const:`None`. + returning ``None``. .. versionchanged:: 3.8 Characters that affect netloc parsing under NFKC normalization will @@ -364,15 +394,28 @@ or on combining URL components into a URL string. .. versionchanged:: 3.12 Leading WHATWG C0 control and space characters are stripped from the URL. + .. 
versionchanged:: 3.14 + Added the *allow_none* parameter. + .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser -.. function:: urlunsplit(parts) +.. function:: urlunsplit(parts, *, keep_empty=False) Combine the elements of a tuple as returned by :func:`urlsplit` into a complete URL as a string. The *parts* argument can be any five-item - iterable. This may result in a slightly different, but equivalent URL, if the - URL that was parsed originally had unnecessary delimiters (for example, a ? - with an empty query; the RFC states that these are equivalent). + iterable. + + This may result in a slightly different, but equivalent URL, if the + URL that was parsed originally had unnecessary delimiters (for example, + a ``?`` with an empty query; the RFC states that these are equivalent). + + If *keep_empty* is true, empty strings are kept in the result (for example, + a ``?`` for an empty query), only ``None`` components are omitted. + This allows to restore the URL that was parsed with option + ``allow_none=True``. + + .. versionchanged:: 3.14 + Added the *keep_empty* parameter. .. function:: urljoin(base, url, allow_fragments=True) @@ -405,27 +448,31 @@ or on combining URL components into a URL string. .. versionchanged:: 3.5 - Behavior updated to match the semantics defined in :rfc:`3986`. + .. versionchanged:: 3.14 + Added the *keep_empty* parameter. -.. function:: urldefrag(url) + +.. function:: urldefrag(url, *, allow_none=False) If *url* contains a fragment identifier, return a modified version of *url* with no fragment identifier, and the fragment identifier as a separate string. If there is no fragment identifier in *url*, return *url* unmodified - and an empty string. + and an empty string (by default) or ``None`` if *allow_none* is true. 
The return value is a :term:`named tuple`, its items can be accessed by index or as named attributes: - +------------------+-------+-------------------------+----------------------+ - | Attribute | Index | Value | Value if not present | - +==================+=======+=========================+======================+ - | :attr:`url` | 0 | URL with no fragment | empty string | - +------------------+-------+-------------------------+----------------------+ - | :attr:`fragment` | 1 | Fragment identifier | empty string | - +------------------+-------+-------------------------+----------------------+ + +------------------+-------+-------------------------+-------------------------------+ + | Attribute | Index | Value | Value if not present | + +==================+=======+=========================+===============================+ + | :attr:`url` | 0 | URL with no fragment | empty string | + +------------------+-------+-------------------------+-------------------------------+ + | :attr:`fragment` | 1 | Fragment identifier | ``None`` or empty string [3]_ | + +------------------+-------+-------------------------+-------------------------------+ + + .. [3] Depending on the value of the *allow_none* argument. See section :ref:`urlparse-result-object` for more information on the result object. @@ -433,6 +480,9 @@ or on combining URL components into a URL string. .. versionchanged:: 3.2 Result is a structured object rather than a simple 2-tuple. + .. versionchanged:: 3.14 + Added the *allow_none* parameter. + .. function:: unwrap(url) Extract the url from a wrapped URL (that is, a string formatted as @@ -452,8 +502,9 @@ URLs elsewhere. Their purpose is for practical functionality rather than purity. Instead of raising an exception on unusual input, they may instead return some -component parts as empty strings. Or components may contain more than perhaps -they should. +component parts as empty strings or ``None`` (depending on the value of the +*allow_none* argument). 
+Or components may contain more than perhaps they should. We recommend that users of these APIs where the values may be used anywhere with security implications code defensively. Do some verification within your @@ -524,12 +575,12 @@ These subclasses add the attributes listed in the documentation for those functions, the encoding and decoding support described in the previous section, as well as an additional method: -.. method:: urllib.parse.SplitResult.geturl() +.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False) Return the re-combined version of the original URL as a string. This may differ from the original URL in that the scheme may be normalized to lower case and empty components may be dropped. Specifically, empty parameters, - queries, and fragment identifiers will be removed. + queries, and fragment identifiers will be removed unless *keep_empty* is true. For :func:`urldefrag` results, only empty fragment identifiers will be removed. For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be @@ -546,6 +597,12 @@ previous section, as well as an additional method: >>> r2 = urlsplit(r1.geturl()) >>> r2.geturl() 'http://www.Python.org/doc/' + >>> r3 = urlsplit(url, allow_none=True) + >>> r3.geturl(keep_empty=True) + 'http://www.Python.org/doc/#' + + .. versionchanged:: 3.14 + Added the *keep_empty* parameter. The following classes provide the implementations of the structured parse diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index bbff8ecdd12117..e0fa815132ec1d 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -225,6 +225,17 @@ symtable (Contributed by Bénédikt Tran in :gh:`120029`.) + +urllib.parse +------------ + +Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, +:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions. 
+Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and +:func:`~urllib.parse.urlunsplit` functions and +:func:`~urllib.parse.SplitResult.geturl` methods. +(Contributed by Serhiy Storchaka in :gh:`67041`.) + .. Add improved modules above alphabetically, not here at the end. Optimizations diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 4c238cf7abff34..356b25058313e4 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -682,7 +682,7 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): Returns a tuple of the defragmented URL and the fragment. If the URL contained no fragments, the second element is the - empty string. + empty string or None if allow_none is True. """ url, _coerce_result = _coerce_args(url) if '#' in url: From eaa9ce6564ff0d1025991d872f9d88a85ed8c726 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 27 Nov 2024 13:11:40 +0200 Subject: [PATCH 3/7] Preserve the status of allow_none in results. --- Doc/library/urllib.parse.rst | 20 ++--- Doc/whatsnew/3.14.rst | 5 +- Lib/test/test_urlparse.py | 81 ++++++++++++------ Lib/urllib/parse.py | 85 ++++++++++++------- ...4-11-27-13-11-16.gh-issue-67041.ym2WKK.rst | 6 ++ 5 files changed, 124 insertions(+), 73 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index ae2ff518b5ba7e..1b66531ccc768b 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -318,6 +318,8 @@ or on combining URL components into a URL string. a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option ``allow_none=True``. + By default, *keep_empty* is true if *parts* is the result of the + :func:`urlparse` call with ``allow_none=True``. .. versionchanged:: 3.14 Added the *keep_empty* parameter. @@ -417,6 +419,8 @@ or on combining URL components into a URL string. 
a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option ``allow_none=True``. + By default, *keep_empty* is true if *parts* is the result of the + :func:`urlsplit` call with ``allow_none=True``. .. versionchanged:: 3.14 Added the *keep_empty* parameter. @@ -461,10 +465,8 @@ or on combining URL components into a URL string. .. versionchanged:: 3.5 - Behavior updated to match the semantics defined in :rfc:`3986`. - .. versionchanged:: 3.14 - Added the *keep_empty* parameter. + Behavior updated to match the semantics defined in :rfc:`3986`. .. function:: urldefrag(url, *, allow_none=False) @@ -588,12 +590,13 @@ These subclasses add the attributes listed in the documentation for those functions, the encoding and decoding support described in the previous section, as well as an additional method: -.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False) +.. method:: urllib.parse.SplitResult.geturl() Return the re-combined version of the original URL as a string. This may differ from the original URL in that the scheme may be normalized to lower case and empty components may be dropped. Specifically, empty parameters, - queries, and fragment identifiers will be removed unless *keep_empty* is true. + queries, and fragment identifiers will be removed unless the URL was parsed + with ``allow_none=True``. For :func:`urldefrag` results, only empty fragment identifiers will be removed. For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be @@ -611,11 +614,8 @@ previous section, as well as an additional method: >>> r2.geturl() 'http://www.Python.org/doc/' >>> r3 = urlsplit(url, allow_none=True) - >>> r1.geturl(keep_empty=True) - 'http://www.Python.org/doc/' - - .. versionchanged:: 3.14 - Added the *keep_empty* parameter. 
+ >>> r3.geturl() + 'http://www.Python.org/doc/#' The following classes provide the implementations of the structured parse diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index c4c48492d92741..8ca9f43d7ae891 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -595,8 +595,9 @@ urllib.parse * Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, :func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions. Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and - :func:`~urllib.parse.urlunsplit` functions and - :func:`~urllib.parse.SplitResult.geturl` methods. + :func:`~urllib.parse.urlunsplit` functions. + This allows to distinguish between empty and not defined URI components + and preserve empty components. (Contributed by Serhiy Storchaka in :gh:`67041`.) uuid diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index a2504365bc0c0c..f9c583710e43a7 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -3,6 +3,7 @@ import unicodedata import unittest import urllib.parse +from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -119,23 +120,50 @@ def _encode(self, s): return tuple(self._encode(x) for x in s) return s - def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True): + def checkRoundtrips(self, url, parsed, split, url2=None): if url2 is None: url2 = url - result = urllib.parse.urlparse(url, allow_none=allow_none) + self.checkRoundtrips1(url, parsed, split, allow_none=True) + empty = url[:0] + parsed = tuple(x or empty for x in parsed) + split = tuple(x or empty for x in split) + self.checkRoundtrips1(url, parsed, split, url2, allow_none=False) + + result = urlparse(url, allow_none=True) + self.assertEqual(urlunparse(result, keep_empty=False), url2) + self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2) + result = urlparse(url, 
allow_none=False) + with self.assertRaises(ValueError): + urlunparse(result, keep_empty=True) + urlunparse(tuple(result), keep_empty=True) + + result = urlsplit(url, allow_none=True) + self.assertEqual(urlunsplit(result, keep_empty=False), url2) + self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2) + result = urlsplit(url, allow_none=False) + with self.assertRaises(ValueError): + urlunsplit(result, keep_empty=True) + urlunsplit(tuple(result), keep_empty=True) + + def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): + if url2 is None: + url2 = url + result = urlparse(url, allow_none=allow_none) self.assertSequenceEqual(result, parsed) t = (result.scheme, result.netloc, result.path, - result.params, result.query, result.fragment) + result.params, result.query, result.fragment) self.assertSequenceEqual(t, parsed) # put it back together and it should be the same - result2 = urllib.parse.urlunparse(result, keep_empty=allow_none) - self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) + result2 = urlunparse(result) + self.assertEqual(result2, url2) + self.assertEqual(result2, result.geturl()) + self.assertEqual(urlunparse(result, keep_empty=allow_none), url2) + self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none) - self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) + result3 = urlparse(result.geturl(), allow_none=allow_none) + self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -149,18 +177,19 @@ def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True): 
self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urllib.parse.urlsplit(url, allow_none=allow_none) + result = urlsplit(url, allow_none=allow_none) self.assertSequenceEqual(result, split) t = (result.scheme, result.netloc, result.path, - result.query, result.fragment) + result.query, result.fragment) self.assertSequenceEqual(t, split) - result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none) - self.assertSequenceEqual(result2, url2) - self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none)) + result2 = urlunsplit(result) + self.assertEqual(result2, url2) + self.assertEqual(result2, result.geturl()) + self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2) # check the fixpoint property of re-parsing the result of geturl() - result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none) - self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none)) + result3 = urlsplit(result.geturl(), allow_none=allow_none) + self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) self.assertEqual(result3.netloc, result.netloc) @@ -288,32 +317,28 @@ def test_roundtrips(self): ] for url, parsed, split in str_cases + bytes_cases: with self.subTest(url): - self.checkRoundtrips(url, parsed, split, allow_none=True) - empty = url[:0] - parsed = tuple(x or empty for x in parsed) - split = tuple(x or empty for x in split) - self.checkRoundtrips(url, parsed, split, allow_none=False) + self.checkRoundtrips(url, parsed, split) def test_roundtrips_normalization(self): str_cases = [ ('///path/to/file', - '///path/to/file', + '/path/to/file', (None, '', '/path/to/file', None, None, None), (None, '', '/path/to/file', None, None)), ('scheme:///path/to/file', - 'scheme:///path/to/file', + 'scheme:/path/to/file', ('scheme', '', '/path/to/file', None, None, 
None), ('scheme', '', '/path/to/file', None, None)), ('file:/tmp/junk.txt', - 'file:/tmp/junk.txt', + 'file:///tmp/junk.txt', ('file', None, '/tmp/junk.txt', None, None, None), ('file', None, '/tmp/junk.txt', None, None)), ('http:/tmp/junk.txt', - 'http:/tmp/junk.txt', + 'http:///tmp/junk.txt', ('http', None, '/tmp/junk.txt', None, None, None), ('http', None, '/tmp/junk.txt', None, None)), ('https:/tmp/junk.txt', - 'https:/tmp/junk.txt', + 'https:///tmp/junk.txt', ('https', None, '/tmp/junk.txt', None, None, None), ('https', None, '/tmp/junk.txt', None, None)), ] @@ -371,9 +396,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True): relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb) - relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True) + relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True)) self.assertEqual(urllib.parse.urljoin(base, relurl3), expected) - relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True) + relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb) def test_unparse_parse(self): @@ -796,7 +821,7 @@ def _encode(t): url = url.rstrip(hash) if frag is None: frag = url[:0] - self.assertEqual(result.geturl(keep_empty=allow_none), url) + self.assertEqual(result.geturl(), url) self.assertEqual(result, (defrag, frag)) self.assertEqual(result.url, defrag) self.assertEqual(result.fragment, frag) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 72c39886d6f065..8d2a05bd134135 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -267,11 +267,27 @@ def _hostinfo(self): return hostname, port -_DefragResultBase = namedtuple('_DefragResultBase', 'url fragment') -_SplitResultBase = namedtuple( - '_SplitResultBase', 'scheme netloc path 
query fragment') -_ParseResultBase = namedtuple( - '_ParseResultBase', 'scheme netloc path params query fragment') +_UNSPECIFIED = ['not specified'] +_ALLOW_NONE_DEFAULT = False + +class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')): + def geturl(self): + if self.fragment or (self.fragment is not None and + getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)): + return self.url + self._HASH + self.fragment + else: + return self.url + +class _SplitResultBase(namedtuple( + '_SplitResultBase', 'scheme netloc path query fragment')): + def geturl(self): + return urlunsplit(self) + +class _ParseResultBase(namedtuple( + '_ParseResultBase', 'scheme netloc path params query fragment')): + def geturl(self): + return urlunparse(self) + _DefragResultBase.__doc__ = """ DefragResult(url, fragment) @@ -339,45 +355,27 @@ def _hostinfo(self): # retained since deprecating it isn't worth the hassle ResultBase = _NetlocResultMixinStr -_ALLOW_NONE_DEFAULT = False - # Structured result objects for string data class DefragResult(_DefragResultBase, _ResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - if self.fragment or (keep_empty and self.fragment is not None): - return self.url + '#' + self.fragment - else: - return self.url + _HASH = '#' class SplitResult(_SplitResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunsplit(self, keep_empty=keep_empty) class ParseResult(_ParseResultBase, _NetlocResultMixinStr): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunparse(self, keep_empty=keep_empty) # Structured result objects for bytes data class DefragResultBytes(_DefragResultBase, _ResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - if self.fragment or (keep_empty and self.fragment is not None): - return self.url + b'#' + self.fragment - else: - return self.url + _HASH = b'#' class 
SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunsplit(self, keep_empty=keep_empty) class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): __slots__ = () - def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT): - return urlunparse(self, keep_empty=keep_empty) # Set up the encode/decode result pairs def _fix_result_transcoding(): @@ -424,7 +422,9 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if query is None: query = '' if fragment is None: fragment = '' result = ParseResult(scheme, netloc, url, params, query, fragment) - return _coerce_result(result) + result = _coerce_result(result) + result._keep_empty = allow_none + return result def _urlparse(url, scheme=None, allow_fragments=True): scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) @@ -513,8 +513,10 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if netloc is None: netloc = '' if query is None: query = '' if fragment is None: fragment = '' - v = SplitResult(scheme, netloc, url, query, fragment) - return _coerce_result(v) + result = SplitResult(scheme, netloc, url, query, fragment) + result = _coerce_result(result) + result._keep_empty = allow_none + return result def _urlsplit(url, scheme=None, allow_fragments=True): # Only lstrip url as some applications rely on preserving trailing space. @@ -551,13 +553,20 @@ def _urlsplit(url, scheme=None, allow_fragments=True): _checknetloc(netloc) return (scheme, netloc, url, query, fragment) -def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT): +def urlunparse(components, *, keep_empty=_UNSPECIFIED): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? 
with an empty query - (the draft states that these are equivalent).""" + (the draft states that these are equivalent) and keep_empty is false + or components is the result of the urlparse() call with allow_none=False.""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) + if keep_empty is _UNSPECIFIED: + keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + elif keep_empty and not getattr(components, '_keep_empty', True): + raise ValueError('Cannot distinguish between empty and not defined ' + 'URI components in the result of parsing URL with ' + 'allow_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -572,14 +581,22 @@ def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT): url = "%s;%s" % (url, params) return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment)) -def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT): +def urlunsplit(components, *, keep_empty=_UNSPECIFIED): """Combine the elements of a tuple as returned by urlsplit() into a complete URL as a string. The data argument can be any five-item iterable. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had unnecessary delimiters (for example, a ? 
with an - empty query; the RFC states that these are equivalent).""" + empty query; the RFC states that these are equivalent) and keep_empty + is false or components is the result of the urlsplit() call with + allow_none=False.""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) + if keep_empty is _UNSPECIFIED: + keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + elif keep_empty and not getattr(components, '_keep_empty', True): + raise ValueError('Cannot distinguish between empty and not defined ' + 'URI components in the result of parsing URL with ' + 'allow_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -692,7 +709,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): frag = None defrag = url if not allow_none and frag is None: frag = '' - return _coerce_result(DefragResult(defrag, frag)) + result = _coerce_result(DefragResult(defrag, frag)) + result._keep_empty = allow_none + return result _hexdig = '0123456789ABCDEFabcdef' _hextobyte = None diff --git a/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst new file mode 100644 index 00000000000000..86a7e754d0aae2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst @@ -0,0 +1,6 @@ +Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, +:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` +functions. Add the *keep_empty* parameter to +:func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit` +functions. This allows to distinguish between empty and not defined URI +components and preserve empty components. From e5c31dd38d8679ffddafa54283a9549227b6c3a6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 5 Dec 2024 13:08:38 +0200 Subject: [PATCH 4/7] Preserve _keep_empty in copying and encoding. 
--- Lib/test/test_urlparse.py | 123 ++++++++++++++++++++++++-------------- Lib/urllib/parse.py | 44 ++++++++++++-- 2 files changed, 118 insertions(+), 49 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index f9c583710e43a7..626f4a59e7ba82 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1,9 +1,10 @@ +import copy import functools import sys import unicodedata import unittest import urllib.parse -from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit +from urllib.parse import urldefrag, urlparse, urlsplit, urlunparse, urlunsplit RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -391,14 +392,14 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True): self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) if relroundtrip: - relurl2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) + relurl2 = urlunsplit(urlsplit(relurl)) self.assertEqual(urllib.parse.urljoin(base, relurl2), expected) - relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) + relurlb2 = urlunsplit(urlsplit(relurlb)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb) - relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True)) + relurl3 = urlunsplit(urlsplit(relurl, allow_none=True)) self.assertEqual(urllib.parse.urljoin(base, relurl3), expected) - relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True)) + relurlb3 = urlunsplit(urlsplit(relurlb, allow_none=True)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb) def test_unparse_parse(self): @@ -458,9 +459,9 @@ def test_RFC1808(self): def test_RFC2368(self): # Issue 11467: path that starts with a number is not parsed correctly - self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'), + self.assertEqual(urlparse('mailto:1337@example.org'), ('mailto', '', '1337@example.org', '', '', '')) - 
self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org', allow_none=True), + self.assertEqual(urlparse('mailto:1337@example.org', allow_none=True), ('mailto', None, '1337@example.org', None, None, None)) def test_RFC2396(self): @@ -1119,50 +1120,50 @@ def test_withoutscheme(self, allow_none): # RFC 1808 specifies that netloc should start with //, urlparse expects # the same, otherwise it classifies the portion of url as path. none = None if allow_none else '' - self.assertEqual(urllib.parse.urlparse("path", allow_none=allow_none), + self.assertEqual(urlparse("path", allow_none=allow_none), (none, none, 'path', none, none, none)) - self.assertEqual(urllib.parse.urlparse("//www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("//www.python.org:80", allow_none=allow_none), (none, 'www.python.org:80', '', none, none, none)) - self.assertEqual(urllib.parse.urlparse("http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("http://www.python.org:80", allow_none=allow_none), ('http', 'www.python.org:80', '', none, none, none)) # Repeat for bytes input none = None if allow_none else b'' - self.assertEqual(urllib.parse.urlparse(b"path", allow_none=allow_none), + self.assertEqual(urlparse(b"path", allow_none=allow_none), (none, none, b'path', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"//www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"//www.python.org:80", allow_none=allow_none), (none, b'www.python.org:80', b'', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"http://www.python.org:80", allow_none=allow_none), (b'http', b'www.python.org:80', b'', none, none, none)) @parametrise_allow_none def test_portseparator(self, allow_none): # Issue 754016 makes changes for port separator ':' from scheme separator none = None if allow_none else '' - self.assertEqual(urllib.parse.urlparse("http:80", 
allow_none=allow_none), + self.assertEqual(urlparse("http:80", allow_none=allow_none), ('http', none, '80', none, none, none)) - self.assertEqual(urllib.parse.urlparse("https:80", allow_none=allow_none), + self.assertEqual(urlparse("https:80", allow_none=allow_none), ('https', none, '80', none, none, none)) - self.assertEqual(urllib.parse.urlparse("path:80", allow_none=allow_none), + self.assertEqual(urlparse("path:80", allow_none=allow_none), ('path', none, '80', none, none, none)) - self.assertEqual(urllib.parse.urlparse("http:", allow_none=allow_none), + self.assertEqual(urlparse("http:", allow_none=allow_none), ('http', none, '', none, none, none)) - self.assertEqual(urllib.parse.urlparse("https:", allow_none=allow_none), + self.assertEqual(urlparse("https:", allow_none=allow_none), ('https', none, '', none, none, none)) - self.assertEqual(urllib.parse.urlparse("http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("http://www.python.org:80", allow_none=allow_none), ('http', 'www.python.org:80', '', none, none, none)) # As usual, need to check bytes input as well none = None if allow_none else b'' - self.assertEqual(urllib.parse.urlparse(b"http:80", allow_none=allow_none), + self.assertEqual(urlparse(b"http:80", allow_none=allow_none), (b'http', none, b'80', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"https:80", allow_none=allow_none), + self.assertEqual(urlparse(b"https:80", allow_none=allow_none), (b'https', none, b'80', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"path:80", allow_none=allow_none), + self.assertEqual(urlparse(b"path:80", allow_none=allow_none), (b'path', none, b'80', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"http:", allow_none=allow_none), + self.assertEqual(urlparse(b"http:", allow_none=allow_none), (b'http', none, b'', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"https:", allow_none=allow_none), + self.assertEqual(urlparse(b"https:", 
allow_none=allow_none), (b'https', none, b'', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"http://www.python.org:80", allow_none=allow_none), (b'http', b'www.python.org:80', b'', none, none, none)) def test_usingsys(self): @@ -1173,24 +1174,24 @@ def test_usingsys(self): def test_anyscheme(self, allow_none): # Issue 7904: s3://foo.com/stuff has netloc "foo.com". none = None if allow_none else '' - self.assertEqual(urllib.parse.urlparse("s3://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse("s3://foo.com/stuff", allow_none=allow_none), ('s3', 'foo.com', '/stuff', none, none, none)) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff", allow_none=allow_none), ('x-newscheme', 'foo.com', '/stuff', none, none, none)) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), ('x-newscheme', 'foo.com', '/stuff', none, 'query', 'fragment')) - self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query", allow_none=allow_none), ('x-newscheme', 'foo.com', '/stuff', none, 'query', none)) # And for bytes... 
none = None if allow_none else b'' - self.assertEqual(urllib.parse.urlparse(b"s3://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse(b"s3://foo.com/stuff", allow_none=allow_none), (b's3', b'foo.com', b'/stuff', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff", allow_none=allow_none), (b'x-newscheme', b'foo.com', b'/stuff', none, none, none)) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', b'fragment')) - self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query", allow_none=allow_none), (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', none)) def test_default_scheme(self): @@ -1274,12 +1275,10 @@ def test_mixed_types_rejected(self): with self.assertRaisesRegex(TypeError, "Cannot mix str"): urllib.parse.urljoin(b"http://python.org", "http://python.org") - def _check_result_type(self, str_type): - num_args = len(str_type._fields) + def _check_result_type(self, str_type, str_args): bytes_type = str_type._encoded_counterpart self.assertIs(bytes_type._decoded_counterpart, str_type) - str_args = ('',) * num_args - bytes_args = (b'',) * num_args + bytes_args = tuple(self._encode(s) for s in str_args) str_result = str_type(*str_args) bytes_result = bytes_type(*bytes_args) encoding = 'ascii' @@ -1298,16 +1297,52 @@ def _check_result_type(self, str_type): self.assertEqual(str_result.encode(encoding), bytes_result) self.assertEqual(str_result.encode(encoding, errors), bytes_args) self.assertEqual(str_result.encode(encoding, errors), bytes_result) + for result in str_result, bytes_result: + 
self.assertEqual(copy.copy(result), result) + self.assertEqual(copy.deepcopy(result), result) + self.assertEqual(copy.replace(result), result) + self.assertEqual(result._replace(), result) def test_result_pairs(self): # Check encoding and decoding between result pairs - result_types = [ - urllib.parse.DefragResult, - urllib.parse.SplitResult, - urllib.parse.ParseResult, - ] - for result_type in result_types: - self._check_result_type(result_type) + self._check_result_type(urllib.parse.DefragResult, ('', '')) + self._check_result_type(urllib.parse.DefragResult, ('', None)) + self._check_result_type(urllib.parse.SplitResult, ('', '', '', '', '')) + self._check_result_type(urllib.parse.SplitResult, (None, None, '', None, None)) + self._check_result_type(urllib.parse.ParseResult, ('', '', '', '', '', '')) + self._check_result_type(urllib.parse.ParseResult, (None, None, '', None, None, None)) + + def test_result_encoding_decoding(self): + def check(str_result, bytes_result): + self.assertEqual(str_result.encode(), bytes_result) + self.assertEqual(str_result.encode().geturl(), bytes_result.geturl()) + self.assertEqual(bytes_result.decode(), str_result) + self.assertEqual(bytes_result.decode().geturl(), str_result.geturl()) + + url = 'http://example.com/?#' + burl = url.encode() + for func in urldefrag, urlsplit, urlparse: + check(func(url, allow_none=True), func(burl, allow_none=True)) + check(func(url), func(burl)) + + def test_result_copying(self): + def check(result): + self.assertEqual(copy.copy(result), result) + self.assertEqual(copy.copy(result).geturl(), result.geturl()) + self.assertEqual(copy.deepcopy(result), result) + self.assertEqual(copy.deepcopy(result).geturl(), result.geturl()) + self.assertEqual(copy.replace(result), result) + self.assertEqual(copy.replace(result).geturl(), result.geturl()) + self.assertEqual(result._replace(), result) + self.assertEqual(result._replace().geturl(), result.geturl()) + + url = 'http://example.com/?#' + burl = url.encode() 
+ for func in urldefrag, urlsplit, urlparse: + check(func(url)) + check(func(url, allow_none=True)) + check(func(burl)) + check(func(burl, allow_none=True)) def test_parse_qs_encoding(self): result = urllib.parse.parse_qs("key=\u0141%E9", encoding="latin-1") diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 8d2a05bd134135..86db54e9b14fe7 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -146,9 +146,14 @@ class _ResultMixinStr(object): __slots__ = () def encode(self, encoding='ascii', errors='strict'): - return self._encoded_counterpart(*(x.encode(encoding, errors) + result = self._encoded_counterpart(*(x.encode(encoding, errors) if x is not None else None for x in self)) + try: + result._keep_empty = self._keep_empty + except AttributeError: + pass + return result class _ResultMixinBytes(object): @@ -156,9 +161,14 @@ class _ResultMixinBytes(object): __slots__ = () def decode(self, encoding='ascii', errors='strict'): - return self._decoded_counterpart(*(x.decode(encoding, errors) + result = self._decoded_counterpart(*(x.decode(encoding, errors) if x is not None else None for x in self)) + try: + result._keep_empty = self._keep_empty + except AttributeError: + pass + return result class _NetlocResultMixinBase(object): @@ -270,7 +280,31 @@ def _hostinfo(self): _UNSPECIFIED = ['not specified'] _ALLOW_NONE_DEFAULT = False -class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')): +class _ResultBase: + def __replace__(self, /, **kwargs): + result = super().__replace__(**kwargs) + try: + result._keep_empty = self._keep_empty + except AttributeError: + pass + return result + + def _replace(self, /, **kwargs): + result = super()._replace(**kwargs) + try: + result._keep_empty = self._keep_empty + except AttributeError: + pass + return result + + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self + + +class _DefragResultBase(_ResultBase, namedtuple('_DefragResultBase', 'url fragment')): def geturl(self): if 
self.fragment or (self.fragment is not None and getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)): @@ -278,12 +312,12 @@ def geturl(self): else: return self.url -class _SplitResultBase(namedtuple( +class _SplitResultBase(_ResultBase, namedtuple( '_SplitResultBase', 'scheme netloc path query fragment')): def geturl(self): return urlunsplit(self) -class _ParseResultBase(namedtuple( +class _ParseResultBase(_ResultBase, namedtuple( '_ParseResultBase', 'scheme netloc path params query fragment')): def geturl(self): return urlunparse(self) From 5846bf2dde7d3e96ba7e273a2c42be1a0bc896ee Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 14 Nov 2025 21:16:16 +0200 Subject: [PATCH 5/7] Remove redundant import. --- Lib/test/test_urlparse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index e7bc72c86a4870..dd2b16866101b1 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1,5 +1,4 @@ import copy -import functools import sys import unicodedata import unittest From 7d59b7ed22336618f662ccd8c2eb61332bd75d3f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 17 Nov 2025 18:28:54 +0200 Subject: [PATCH 6/7] Rename allow_none to missing_as_none. --- Doc/library/urllib.parse.rst | 45 ++--- Doc/whatsnew/3.15.rst | 2 +- Lib/test/test_urlparse.py | 158 +++++++++--------- Lib/urllib/parse.py | 43 ++--- ...4-11-27-13-11-16.gh-issue-67041.ym2WKK.rst | 2 +- 5 files changed, 126 insertions(+), 124 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 17ff0c0c6b6666..01af538dd3816e 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -50,14 +50,15 @@ URL Parsing The URL parsing functions focus on splitting a URL string into its components, or on combining URL components into a URL string. -.. function:: urlparse(urlstring, scheme=None, allow_fragments=True, *, allow_none=False) +.. 
function:: urlparse(urlstring, scheme=None, allow_fragments=True, *, missing_as_none=False) Parse a URL into six components, returning a 6-item :term:`named tuple`. This corresponds to the general structure of a URL: ``scheme://netloc/path;parameters?query#fragment``. - Each tuple item is a string, possibly empty, or ``None`` if *allow_none* is true. + Each tuple item is a string, possibly empty, or ``None`` if + *missing_as_none* is true. Not defined component are represented an empty string (by default) or - ``None`` if *allow_none* is true. + ``None`` if *missing_as_none* is true. The components are not broken up into smaller parts (for example, the network location is a single string), and % escapes are not expanded. The delimiters as shown above are not part of the @@ -90,7 +91,7 @@ or on combining URL components into a URL string. >>> urlparse("http://docs.python.org?") ParseResult(scheme='http', netloc='docs.python.org', path='', params='', query='', fragment='') - >>> urlparse("http://docs.python.org?", allow_none=True) + >>> urlparse("http://docs.python.org?", missing_as_none=True) ParseResult(scheme='http', netloc='docs.python.org', path='', params=None, query='', fragment=None) @@ -112,7 +113,7 @@ or on combining URL components into a URL string. >>> urlparse('help/Python.html') ParseResult(scheme='', netloc='', path='help/Python.html', params='', query='', fragment='') - >>> urlparse('help/Python.html', allow_none=True) + >>> urlparse('help/Python.html', missing_as_none=True) ParseResult(scheme=None, netloc=None, path='help/Python.html', params=None, query=None, fragment=None) @@ -124,7 +125,7 @@ or on combining URL components into a URL string. If the *allow_fragments* argument is false, fragment identifiers are not recognized. Instead, they are parsed as part of the path, parameters or query component, and :attr:`fragment` is set to ``None`` or the empty - string (depending on the value of *allow_none*) in the return value. 
+ string (depending on the value of *missing_as_none*) in the return value. The return value is a :term:`named tuple`, which means that its items can be accessed by index or as named attributes, which are: @@ -156,7 +157,7 @@ or on combining URL components into a URL string. | | | if present | | +------------------+-------+-------------------------+-------------------------------+ - .. [1] Depending on the value of the *allow_none* argument. + .. [1] Depending on the value of the *missing_as_none* argument. Reading the :attr:`port` attribute will raise a :exc:`ValueError` if an invalid port is specified in the URL. See section @@ -209,7 +210,7 @@ or on combining URL components into a URL string. now raise :exc:`ValueError`. .. versionchanged:: next - Added the *allow_none* parameter. + Added the *missing_as_none* parameter. .. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, separator='&') @@ -317,15 +318,15 @@ or on combining URL components into a URL string. If *keep_empty* is true, empty strings are kept in the result (for example, a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option - ``allow_none=True``. + ``missing_as_none=True``. By default, *keep_empty* is true if *parts* is the result of the - :func:`urlparse` call with ``allow_none=True``. + :func:`urlparse` call with ``missing_as_none=True``. .. versionchanged:: next Added the *keep_empty* parameter. -.. function:: urlsplit(urlstring, scheme=None, allow_fragments=True, *, allow_none=False) +.. function:: urlsplit(urlstring, scheme=None, allow_fragments=True, *, missing_as_none=False) This is similar to :func:`urlparse`, but does not split the params from the URL. This should generally be used instead of :func:`urlparse` if the more recent URL @@ -363,7 +364,7 @@ or on combining URL components into a URL string. 
| | | if present | | +------------------+-------+-------------------------+-------------------------------+ - .. [2] Depending on the value of the *allow_none* argument. + .. [2] Depending on the value of the *missing_as_none* argument. Reading the :attr:`port` attribute will raise a :exc:`ValueError` if an invalid port is specified in the URL. See section @@ -401,7 +402,7 @@ or on combining URL components into a URL string. Leading WHATWG C0 control and space characters are stripped from the URL. .. versionchanged:: next - Added the *allow_none* parameter. + Added the *missing_as_none* parameter. .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser @@ -418,9 +419,9 @@ or on combining URL components into a URL string. If *keep_empty* is true, empty strings are kept in the result (for example, a ``?`` for an empty query), only ``None`` components are omitted. This allows to restore the URL that was parsed with option - ``allow_none=True``. + ``missing_as_none=True``. By default, *keep_empty* is true if *parts* is the result of the - :func:`urlsplit` call with ``allow_none=True``. + :func:`urlsplit` call with ``missing_as_none=True``. .. versionchanged:: next Added the *keep_empty* parameter. @@ -469,12 +470,12 @@ or on combining URL components into a URL string. Behavior updated to match the semantics defined in :rfc:`3986`. -.. function:: urldefrag(url, *, allow_none=False) +.. function:: urldefrag(url, *, missing_as_none=False) If *url* contains a fragment identifier, return a modified version of *url* with no fragment identifier, and the fragment identifier as a separate string. If there is no fragment identifier in *url*, return *url* unmodified - and an empty string (by default) or ``None`` if *allow_none* is true. + and an empty string (by default) or ``None`` if *missing_as_none* is true. 
The return value is a :term:`named tuple`, its items can be accessed by index or as named attributes: @@ -487,7 +488,7 @@ or on combining URL components into a URL string. | :attr:`fragment` | 1 | Fragment identifier | ``None`` or empty string [3]_ | +------------------+-------+-------------------------+-------------------------------+ - .. [3] Depending on the value of the *allow_none* argument. + .. [3] Depending on the value of the *missing_as_none* argument. See section :ref:`urlparse-result-object` for more information on the result object. @@ -496,7 +497,7 @@ or on combining URL components into a URL string. Result is a structured object rather than a simple 2-tuple. .. versionchanged:: next - Added the *allow_none* parameter. + Added the *missing_as_none* parameter. .. function:: unwrap(url) @@ -518,7 +519,7 @@ purity. Instead of raising an exception on unusual input, they may instead return some component parts as empty strings or ``None`` (depending on the value of the -*allow_none* argument). +*missing_as_none* argument). Or components may contain more than perhaps they should. We recommend that users of these APIs where the values may be used anywhere @@ -596,7 +597,7 @@ previous section, as well as an additional method: differ from the original URL in that the scheme may be normalized to lower case and empty components may be dropped. Specifically, empty parameters, queries, and fragment identifiers will be removed unless the URL was parsed - with ``allow_none=True``. + with ``missing_as_none=True``. For :func:`urldefrag` results, only empty fragment identifiers will be removed. 
For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be @@ -613,7 +614,7 @@ previous section, as well as an additional method: >>> r2 = urlsplit(r1.geturl()) >>> r2.geturl() 'http://www.Python.org/doc/' - >>> r3 = urlsplit(url, allow_none=True) + >>> r3 = urlsplit(url, missing_as_none=True) >>> r3.geturl() 'http://www.Python.org/doc/#' diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 3acc478ca6c0ba..d59721ae3d9267 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -779,7 +779,7 @@ unittest urllib.parse ------------ -* Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, +* Add the *missing_as_none* parameter to :func:`~urllib.parse.urlparse`, :func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions. Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit` functions. diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index dd2b16866101b1..207930a2cb9a77 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -109,32 +109,32 @@ class UrlParseTestCase(unittest.TestCase): def checkRoundtrips(self, url, parsed, split, url2=None): if url2 is None: url2 = url - self.checkRoundtrips1(url, parsed, split, allow_none=True) + self.checkRoundtrips1(url, parsed, split, missing_as_none=True) empty = url[:0] parsed = tuple(x or empty for x in parsed) split = tuple(x or empty for x in split) - self.checkRoundtrips1(url, parsed, split, url2, allow_none=False) + self.checkRoundtrips1(url, parsed, split, url2, missing_as_none=False) - result = urlparse(url, allow_none=True) + result = urlparse(url, missing_as_none=True) self.assertEqual(urlunparse(result, keep_empty=False), url2) self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2) - result = urlparse(url, allow_none=False) + result = urlparse(url, missing_as_none=False) with self.assertRaises(ValueError): urlunparse(result, keep_empty=True) 
urlunparse(tuple(result), keep_empty=True) - result = urlsplit(url, allow_none=True) + result = urlsplit(url, missing_as_none=True) self.assertEqual(urlunsplit(result, keep_empty=False), url2) self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2) - result = urlsplit(url, allow_none=False) + result = urlsplit(url, missing_as_none=False) with self.assertRaises(ValueError): urlunsplit(result, keep_empty=True) urlunsplit(tuple(result), keep_empty=True) - def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): + def checkRoundtrips1(self, url, parsed, split, url2=None, *, missing_as_none): if url2 is None: url2 = url - result = urlparse(url, allow_none=allow_none) + result = urlparse(url, missing_as_none=missing_as_none) self.assertSequenceEqual(result, parsed) t = (result.scheme, result.netloc, result.path, result.params, result.query, result.fragment) @@ -143,12 +143,12 @@ def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): result2 = urlunparse(result) self.assertEqual(result2, url2) self.assertEqual(result2, result.geturl()) - self.assertEqual(urlunparse(result, keep_empty=allow_none), url2) - self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2) + self.assertEqual(urlunparse(result, keep_empty=missing_as_none), url2) + self.assertEqual(urlunparse(tuple(result), keep_empty=missing_as_none), result2) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urlparse(result.geturl(), allow_none=allow_none) + result3 = urlparse(result.geturl(), missing_as_none=missing_as_none) self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -163,7 +163,7 @@ def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urlsplit(url, allow_none=allow_none) + 
result = urlsplit(url, missing_as_none=missing_as_none) self.assertSequenceEqual(result, split) t = (result.scheme, result.netloc, result.path, result.query, result.fragment) @@ -171,10 +171,10 @@ def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none): result2 = urlunsplit(result) self.assertEqual(result2, url2) self.assertEqual(result2, result.geturl()) - self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2) + self.assertEqual(urlunsplit(tuple(result), keep_empty=missing_as_none), result2) # check the fixpoint property of re-parsing the result of geturl() - result3 = urlsplit(result.geturl(), allow_none=allow_none) + result3 = urlsplit(result.geturl(), missing_as_none=missing_as_none) self.assertEqual(result3.geturl(), result.geturl()) self.assertSequenceEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -383,9 +383,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True): relurlb2 = urlunsplit(urlsplit(relurlb)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb) - relurl3 = urlunsplit(urlsplit(relurl, allow_none=True)) + relurl3 = urlunsplit(urlsplit(relurl, missing_as_none=True)) self.assertEqual(urllib.parse.urljoin(base, relurl3), expected) - relurlb3 = urlunsplit(urlsplit(relurlb, allow_none=True)) + relurlb3 = urlunsplit(urlsplit(relurlb, missing_as_none=True)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb) @support.subTests('bytes', (False, True)) @@ -448,7 +448,7 @@ def test_RFC2368(self): # Issue 11467: path that starts with a number is not parsed correctly self.assertEqual(urlparse('mailto:1337@example.org'), ('mailto', '', '1337@example.org', '', '', '')) - self.assertEqual(urlparse('mailto:1337@example.org', allow_none=True), + self.assertEqual(urlparse('mailto:1337@example.org', missing_as_none=True), ('mailto', None, '1337@example.org', None, None, None)) def test_RFC2396(self): @@ -801,14 +801,14 @@ def test_RFC2732_invalid(self, bytes, 
invalid_url): ('//a/b/c;p?q#f', '//a/b/c;p?q', 'f'), ('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'), ]) - @support.subTests('allow_none', (False, True)) - def test_urldefrag(self, bytes, url, defrag, frag, allow_none): + @support.subTests('missing_as_none', (False, True)) + def test_urldefrag(self, bytes, url, defrag, frag, missing_as_none): if bytes: url = str_encode(url) defrag = str_encode(defrag) frag = str_encode(frag) - result = urllib.parse.urldefrag(url, allow_none=allow_none) - if not allow_none: + result = urllib.parse.urldefrag(url, missing_as_none=missing_as_none) + if not missing_as_none: hash = '#' if isinstance(url, str) else b'#' url = url.rstrip(hash) if frag is None: @@ -1043,27 +1043,27 @@ def test_attributes_bad_scheme(self, bytes, parse, scheme): if not url.isascii(): self.skipTest('non-ASCII bytes') url = url.encode("ascii") - p = parse(url, allow_none=True) + p = parse(url, missing_as_none=True) self.assertIsNone(p.scheme) - @support.subTests('allow_none', (False, True)) - def test_attributes_without_netloc(self, allow_none): + @support.subTests('missing_as_none', (False, True)) + def test_attributes_without_netloc(self, missing_as_none): # This example is straight from RFC 3261. It looks like it # should allow the username, hostname, and port to be filled # in, but doesn't. Since it's a URI and doesn't use the # scheme://netloc syntax, the netloc and related attributes # should be left empty. 
uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urllib.parse.urlsplit(uri, allow_none=allow_none) - self.assertEqual(p.netloc, None if allow_none else "") + p = urllib.parse.urlsplit(uri, missing_as_none=missing_as_none) + self.assertEqual(p.netloc, None if missing_as_none else "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urllib.parse.urlparse(uri, allow_none=allow_none) - self.assertEqual(p.netloc, None if allow_none else "") + p = urllib.parse.urlparse(uri, missing_as_none=missing_as_none) + self.assertEqual(p.netloc, None if missing_as_none else "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) @@ -1072,16 +1072,16 @@ def test_attributes_without_netloc(self, allow_none): # You guessed it, repeating the test with bytes input uri = b"sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urllib.parse.urlsplit(uri, allow_none=allow_none) - self.assertEqual(p.netloc, None if allow_none else b"") + p = urllib.parse.urlsplit(uri, missing_as_none=missing_as_none) + self.assertEqual(p.netloc, None if missing_as_none else b"") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urllib.parse.urlparse(uri, allow_none=allow_none) - self.assertEqual(p.netloc, None if allow_none else b"") + p = urllib.parse.urlparse(uri, missing_as_none=missing_as_none) + self.assertEqual(p.netloc, None if missing_as_none else b"") self.assertEqual(p.username, None) self.assertEqual(p.password, None) self.assertEqual(p.hostname, None) @@ -1095,85 +1095,85 @@ def test_noslash(self): self.assertEqual(urllib.parse.urlparse(b"http://example.com?blahblah=/foo"), (b'http', b'example.com', b'', b'', b'blahblah=/foo', b'')) - @support.subTests('allow_none', (False, True)) 
- def test_withoutscheme(self, allow_none): + @support.subTests('missing_as_none', (False, True)) + def test_withoutscheme(self, missing_as_none): # Test urlparse without scheme # Issue 754016: urlparse goes wrong with IP:port without scheme # RFC 1808 specifies that netloc should start with //, urlparse expects # the same, otherwise it classifies the portion of url as path. - none = None if allow_none else '' - self.assertEqual(urlparse("path", allow_none=allow_none), + none = None if missing_as_none else '' + self.assertEqual(urlparse("path", missing_as_none=missing_as_none), (none, none, 'path', none, none, none)) - self.assertEqual(urlparse("//www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("//www.python.org:80", missing_as_none=missing_as_none), (none, 'www.python.org:80', '', none, none, none)) - self.assertEqual(urlparse("http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("http://www.python.org:80", missing_as_none=missing_as_none), ('http', 'www.python.org:80', '', none, none, none)) # Repeat for bytes input - none = None if allow_none else b'' - self.assertEqual(urlparse(b"path", allow_none=allow_none), + none = None if missing_as_none else b'' + self.assertEqual(urlparse(b"path", missing_as_none=missing_as_none), (none, none, b'path', none, none, none)) - self.assertEqual(urlparse(b"//www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"//www.python.org:80", missing_as_none=missing_as_none), (none, b'www.python.org:80', b'', none, none, none)) - self.assertEqual(urlparse(b"http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"http://www.python.org:80", missing_as_none=missing_as_none), (b'http', b'www.python.org:80', b'', none, none, none)) - @support.subTests('allow_none', (False, True)) - def test_portseparator(self, allow_none): + @support.subTests('missing_as_none', (False, True)) + def test_portseparator(self, missing_as_none): # Issue 754016 makes 
changes for port separator ':' from scheme separator - none = None if allow_none else '' - self.assertEqual(urlparse("http:80", allow_none=allow_none), + none = None if missing_as_none else '' + self.assertEqual(urlparse("http:80", missing_as_none=missing_as_none), ('http', none, '80', none, none, none)) - self.assertEqual(urlparse("https:80", allow_none=allow_none), + self.assertEqual(urlparse("https:80", missing_as_none=missing_as_none), ('https', none, '80', none, none, none)) - self.assertEqual(urlparse("path:80", allow_none=allow_none), + self.assertEqual(urlparse("path:80", missing_as_none=missing_as_none), ('path', none, '80', none, none, none)) - self.assertEqual(urlparse("http:", allow_none=allow_none), + self.assertEqual(urlparse("http:", missing_as_none=missing_as_none), ('http', none, '', none, none, none)) - self.assertEqual(urlparse("https:", allow_none=allow_none), + self.assertEqual(urlparse("https:", missing_as_none=missing_as_none), ('https', none, '', none, none, none)) - self.assertEqual(urlparse("http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse("http://www.python.org:80", missing_as_none=missing_as_none), ('http', 'www.python.org:80', '', none, none, none)) # As usual, need to check bytes input as well - none = None if allow_none else b'' - self.assertEqual(urlparse(b"http:80", allow_none=allow_none), + none = None if missing_as_none else b'' + self.assertEqual(urlparse(b"http:80", missing_as_none=missing_as_none), (b'http', none, b'80', none, none, none)) - self.assertEqual(urlparse(b"https:80", allow_none=allow_none), + self.assertEqual(urlparse(b"https:80", missing_as_none=missing_as_none), (b'https', none, b'80', none, none, none)) - self.assertEqual(urlparse(b"path:80", allow_none=allow_none), + self.assertEqual(urlparse(b"path:80", missing_as_none=missing_as_none), (b'path', none, b'80', none, none, none)) - self.assertEqual(urlparse(b"http:", allow_none=allow_none), + self.assertEqual(urlparse(b"http:", 
missing_as_none=missing_as_none), (b'http', none, b'', none, none, none)) - self.assertEqual(urlparse(b"https:", allow_none=allow_none), + self.assertEqual(urlparse(b"https:", missing_as_none=missing_as_none), (b'https', none, b'', none, none, none)) - self.assertEqual(urlparse(b"http://www.python.org:80", allow_none=allow_none), + self.assertEqual(urlparse(b"http://www.python.org:80", missing_as_none=missing_as_none), (b'http', b'www.python.org:80', b'', none, none, none)) def test_usingsys(self): # Issue 3314: sys module is used in the error self.assertRaises(TypeError, urllib.parse.urlencode, "foo") - @support.subTests('allow_none', (False, True)) - def test_anyscheme(self, allow_none): + @support.subTests('missing_as_none', (False, True)) + def test_anyscheme(self, missing_as_none): # Issue 7904: s3://foo.com/stuff has netloc "foo.com". - none = None if allow_none else '' - self.assertEqual(urlparse("s3://foo.com/stuff", allow_none=allow_none), + none = None if missing_as_none else '' + self.assertEqual(urlparse("s3://foo.com/stuff", missing_as_none=missing_as_none), ('s3', 'foo.com', '/stuff', none, none, none)) - self.assertEqual(urlparse("x-newscheme://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff", missing_as_none=missing_as_none), ('x-newscheme', 'foo.com', '/stuff', none, none, none)) - self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query#fragment", missing_as_none=missing_as_none), ('x-newscheme', 'foo.com', '/stuff', none, 'query', 'fragment')) - self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query", allow_none=allow_none), + self.assertEqual(urlparse("x-newscheme://foo.com/stuff?query", missing_as_none=missing_as_none), ('x-newscheme', 'foo.com', '/stuff', none, 'query', none)) # And for bytes... 
- none = None if allow_none else b'' - self.assertEqual(urlparse(b"s3://foo.com/stuff", allow_none=allow_none), + none = None if missing_as_none else b'' + self.assertEqual(urlparse(b"s3://foo.com/stuff", missing_as_none=missing_as_none), (b's3', b'foo.com', b'/stuff', none, none, none)) - self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff", missing_as_none=missing_as_none), (b'x-newscheme', b'foo.com', b'/stuff', none, none, none)) - self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query#fragment", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query#fragment", missing_as_none=missing_as_none), (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', b'fragment')) - self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query", allow_none=allow_none), + self.assertEqual(urlparse(b"x-newscheme://foo.com/stuff?query", missing_as_none=missing_as_none), (b'x-newscheme', b'foo.com', b'/stuff', none, b'query', none)) @support.subTests('func', (urllib.parse.urlparse, urllib.parse.urlsplit)) @@ -1187,11 +1187,11 @@ def test_default_scheme(self, func): self.assertEqual(func("path", scheme="ftp").scheme, "ftp") self.assertEqual(func(b"path", scheme=b"ftp").scheme, b"ftp") self.assertEqual(func("path").scheme, "") - self.assertEqual(func("path", allow_none=True).scheme, None) + self.assertEqual(func("path", missing_as_none=True).scheme, None) self.assertEqual(func(b"path").scheme, b"") - self.assertEqual(func(b"path", allow_none=True).scheme, None) + self.assertEqual(func(b"path", missing_as_none=True).scheme, None) self.assertEqual(func(b"path", "").scheme, b"") - self.assertEqual(func(b"path", "", allow_none=True).scheme, b"") + self.assertEqual(func(b"path", "", missing_as_none=True).scheme, b"") @support.subTests('url,attr,expected_frag', ( ("http:#frag", "path", "frag"), @@ -1216,11 +1216,11 @@ def test_parse_fragments(self, url, attr, 
expected_frag, func): "#" + expected_frag) self.assertEqual(func(url, "", False).fragment, "") - result = func(url, allow_fragments=False, allow_none=True) + result = func(url, allow_fragments=False, missing_as_none=True) self.assertIsNone(result.fragment) self.assertTrue( getattr(result, attr).endswith("#" + expected_frag)) - self.assertIsNone(func(url, "", False, allow_none=True).fragment) + self.assertIsNone(func(url, "", False, missing_as_none=True).fragment) result = func(url, allow_fragments=True) self.assertEqual(result.fragment, expected_frag) @@ -1301,7 +1301,7 @@ def check(str_result, bytes_result): url = 'http://example.com/?#' burl = url.encode() for func in urldefrag, urlsplit, urlparse: - check(func(url, allow_none=True), func(burl, allow_none=True)) + check(func(url, missing_as_none=True), func(burl, missing_as_none=True)) check(func(url), func(burl)) def test_result_copying(self): @@ -1319,9 +1319,9 @@ def check(result): burl = url.encode() for func in urldefrag, urlsplit, urlparse: check(func(url)) - check(func(url, allow_none=True)) + check(func(url, missing_as_none=True)) check(func(burl)) - check(func(burl, allow_none=True)) + check(func(burl, missing_as_none=True)) def test_parse_qs_encoding(self): result = urllib.parse.parse_qs("key=\u0141%E9", encoding="latin-1") @@ -1566,7 +1566,7 @@ def test_telurl_params(self): self.assertEqual(p1.path, '+1-201-555-0123') self.assertEqual(p1.params, '') - p1 = urllib.parse.urlparse('tel:+1-201-555-0123', allow_none=True) + p1 = urllib.parse.urlparse('tel:+1-201-555-0123', missing_as_none=True) self.assertEqual(p1.scheme, 'tel') self.assertEqual(p1.path, '+1-201-555-0123') self.assertEqual(p1.params, None) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 451f2199b06200..f9d344b454e99c 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -276,7 +276,7 @@ def _hostinfo(self): _UNSPECIFIED = ['not specified'] -_ALLOW_NONE_DEFAULT = False +_MISSING_AS_NONE_DEFAULT = False class _ResultBase: 
def __replace__(self, /, **kwargs): @@ -305,7 +305,7 @@ def __deepcopy__(self, memo): class _DefragResultBase(_ResultBase, namedtuple('_DefragResultBase', 'url fragment')): def geturl(self): if self.fragment or (self.fragment is not None and - getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)): + getattr(self, '_keep_empty', _MISSING_AS_NONE_DEFAULT)): return self.url + self._HASH + self.fragment else: return self.url @@ -423,7 +423,7 @@ def _fix_result_transcoding(): _fix_result_transcoding() del _fix_result_transcoding -def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_DEFAULT): +def urlparse(url, scheme=None, allow_fragments=True, *, missing_as_none=_MISSING_AS_NONE_DEFAULT): """Parse a URL into 6 components: :///;?# @@ -447,7 +447,7 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if url is None: url = '' scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments) - if not allow_none: + if not missing_as_none: if scheme is None: scheme = '' if netloc is None: netloc = '' if params is None: params = '' @@ -455,22 +455,22 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if fragment is None: fragment = '' result = ParseResult(scheme, netloc, url, params, query, fragment) result = _coerce_result(result) - result._keep_empty = allow_none + result._keep_empty = missing_as_none return result def _urlparse(url, scheme=None, allow_fragments=True): scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) if (scheme or '') in uses_params and ';' in url: - url, params = _splitparams(url, allow_none=True) + url, params = _splitparams(url, missing_as_none=True) else: params = None return (scheme, netloc, url, params, query, fragment) -def _splitparams(url, allow_none=False): +def _splitparams(url, missing_as_none=False): if '/' in url: i = url.find(';', url.rfind('/')) if i < 0: - return url, None if allow_none else '' + return 
url, None if missing_as_none else '' else: i = url.find(';') return url[:i], url[i+1:] @@ -532,7 +532,7 @@ def _check_bracketed_host(hostname): # typed=True avoids BytesWarnings being emitted during cache key # comparison since this API supports both bytes and str input. @functools.lru_cache(typed=True) -def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_DEFAULT): +def urlsplit(url, scheme=None, allow_fragments=True, *, missing_as_none=_MISSING_AS_NONE_DEFAULT): """Parse a URL into 5 components: :///?# @@ -557,14 +557,14 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D if url is None: url = '' scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) - if not allow_none: + if not missing_as_none: if scheme is None: scheme = '' if netloc is None: netloc = '' if query is None: query = '' if fragment is None: fragment = '' result = SplitResult(scheme, netloc, url, query, fragment) result = _coerce_result(result) - result._keep_empty = allow_none + result._keep_empty = missing_as_none return result def _urlsplit(url, scheme=None, allow_fragments=True): @@ -606,15 +606,16 @@ def urlunparse(components, *, keep_empty=_UNSPECIFIED): slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? 
with an empty query (the draft states that these are equivalent) and keep_empty is false - or components is the result of the urlparse() call with allow_none=False.""" + or components is the result of the urlparse() call with + missing_as_none=False.""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) if keep_empty is _UNSPECIFIED: - keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + keep_empty = getattr(components, '_keep_empty', _MISSING_AS_NONE_DEFAULT) elif keep_empty and not getattr(components, '_keep_empty', True): raise ValueError('Cannot distinguish between empty and not defined ' 'URI components in the result of parsing URL with ' - 'allow_none=False') + 'missing_as_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -636,15 +637,15 @@ def urlunsplit(components, *, keep_empty=_UNSPECIFIED): was parsed originally had unnecessary delimiters (for example, a ? 
with an empty query; the RFC states that these are equivalent) and keep_empty is false or components is the result of the urlsplit() call with - allow_none=False.""" + missing_as_none=False.""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) if keep_empty is _UNSPECIFIED: - keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT) + keep_empty = getattr(components, '_keep_empty', _MISSING_AS_NONE_DEFAULT) elif keep_empty and not getattr(components, '_keep_empty', True): raise ValueError('Cannot distinguish between empty and not defined ' 'URI components in the result of parsing URL with ' - 'allow_none=False') + 'missing_as_none=False') if not keep_empty: if not netloc: if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): @@ -742,12 +743,12 @@ def urljoin(base, url, allow_fragments=True): resolved_path) or '/', query, fragment)) -def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): +def urldefrag(url, *, missing_as_none=_MISSING_AS_NONE_DEFAULT): """Removes any existing fragment from URL. Returns a tuple of the defragmented URL and the fragment. If the URL contained no fragments, the second element is the - empty string or None if allow_none is True. + empty string or None if missing_as_none is True. 
""" url, _coerce_result = _coerce_args(url) if '#' in url: @@ -756,9 +757,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT): else: frag = None defrag = url - if not allow_none and frag is None: frag = '' + if not missing_as_none and frag is None: frag = '' result = _coerce_result(DefragResult(defrag, frag)) - result._keep_empty = allow_none + result._keep_empty = missing_as_none return result _hexdig = '0123456789ABCDEFabcdef' diff --git a/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst index 86a7e754d0aae2..9ad1e28eac17c7 100644 --- a/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst +++ b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst @@ -1,4 +1,4 @@ -Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`, +Add the *missing_as_none* parameter to :func:`~urllib.parse.urlparse`, :func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions. Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit` From d025fa82338d60f14f9686d2d6c390e596c0030c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 19 Nov 2025 10:12:55 +0200 Subject: [PATCH 7/7] Address review comments. --- Doc/library/urllib.parse.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 01af538dd3816e..ba6e46858f9d26 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -306,7 +306,8 @@ or on combining URL components into a URL string. separator key, with ``&`` as the default separator. -.. function:: urlunparse(parts, *, keep_empty=False) +.. function:: urlunparse(parts) + urlunparse(parts, *, keep_empty) Construct a URL from a tuple as returned by ``urlparse()``. The *parts* argument can be any six-item iterable. 
@@ -317,7 +318,7 @@ or on combining URL components into a URL string. If *keep_empty* is true, empty strings are kept in the result (for example, a ``?`` for an empty query), only ``None`` components are omitted. - This allows to restore the URL that was parsed with option + This allows rebuilding a URL that was parsed with option ``missing_as_none=True``. By default, *keep_empty* is true if *parts* is the result of the :func:`urlparse` call with ``missing_as_none=True``. @@ -406,7 +407,8 @@ or on combining URL components into a URL string. .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser -.. function:: urlunsplit(parts, *, keep_empty=False) +.. function:: urlunsplit(parts) + urlunsplit(parts, *, keep_empty) Combine the elements of a tuple as returned by :func:`urlsplit` into a complete URL as a string. The *parts* argument can be any five-item @@ -418,7 +420,7 @@ or on combining URL components into a URL string. If *keep_empty* is true, empty strings are kept in the result (for example, a ``?`` for an empty query), only ``None`` components are omitted. - This allows to restore the URL that was parsed with option + This allows rebuilding a URL that was parsed with option ``missing_as_none=True``. By default, *keep_empty* is true if *parts* is the result of the :func:`urlsplit` call with ``missing_as_none=True``.