Preserve the status of allow_none in results.

serhiy-storchaka · serhiy-storchaka · commit eaa9ce6564ff · 2024-11-27T13:11:40.000+02:00
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
@@ -318,6 +318,8 @@ or on combining URL components into a URL string.
    a ``?`` for an empty query), only ``None`` components are omitted.
    This allows to restore the URL that was parsed with option
    ``allow_none=True``.
+   If not specified, *keep_empty* defaults to true when *parts* is the result
+   of a :func:`urlparse` call with ``allow_none=True``, and to false otherwise.
 
    .. versionchanged:: 3.14
       Added the *keep_empty* parameter.
@@ -417,6 +419,8 @@ or on combining URL components into a URL string.
    a ``?`` for an empty query), only ``None`` components are omitted.
    This allows to restore the URL that was parsed with option
    ``allow_none=True``.
+   If not specified, *keep_empty* defaults to true when *parts* is the result
+   of a :func:`urlsplit` call with ``allow_none=True``, and to false otherwise.
 
    .. versionchanged:: 3.14
       Added the *keep_empty* parameter.
@@ -461,10 +465,8 @@ or on combining URL components into a URL string.
 
 
    .. versionchanged:: 3.5
-      Behavior updated to match the semantics defined in :rfc:`3986`.
 
-   .. versionchanged:: 3.14
-      Added the *keep_empty* parameter.
+      Behavior updated to match the semantics defined in :rfc:`3986`.
 
 
 .. function:: urldefrag(url, *, allow_none=False)
@@ -588,12 +590,13 @@ These subclasses add the attributes listed in the documentation for
 those functions, the encoding and decoding support described in the
 previous section, as well as an additional method:
 
-.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False)
+.. method:: urllib.parse.SplitResult.geturl()
 
    Return the re-combined version of the original URL as a string. This may
    differ from the original URL in that the scheme may be normalized to lower
    case and empty components may be dropped. Specifically, empty parameters,
-   queries, and fragment identifiers will be removed unless *keep_empty* is true.
+   queries, and fragment identifiers will be removed unless the URL was parsed
+   with ``allow_none=True``.
 
    For :func:`urldefrag` results, only empty fragment identifiers will be removed.
    For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be
@@ -611,11 +614,8 @@ previous section, as well as an additional method:
       >>> r2.geturl()
       'http://www.Python.org/doc/'
       >>> r3 = urlsplit(url, allow_none=True)
-      >>> r1.geturl(keep_empty=True)
-      'http://www.Python.org/doc/'
-
-   .. versionchanged:: 3.14
-      Added the *keep_empty* parameter.
+      >>> r3.geturl()
+      'http://www.Python.org/doc/#'
 
 
 The following classes provide the implementations of the structured parse
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -595,8 +595,9 @@ urllib.parse
 * Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
   :func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions.
   Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and
-  :func:`~urllib.parse.urlunsplit` functions and
-  :func:`~urllib.parse.SplitResult.geturl` methods.
+  :func:`~urllib.parse.urlunsplit` functions.
+  This makes it possible to distinguish between empty and undefined URI
+  components and to preserve empty components.
   (Contributed by Serhiy Storchaka in :gh:`67041`.)
 
 uuid
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -3,6 +3,7 @@
 import unicodedata
 import unittest
 import urllib.parse
+from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit
 
 RFC1808_BASE = "http://a/b/c/d;p?q#f"
 RFC2396_BASE = "http://a/b/c/d;p?q"
@@ -119,23 +120,50 @@ def _encode(self, s):
             return tuple(self._encode(x) for x in s)
         return s
 
-    def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
+    def checkRoundtrips(self, url, parsed, split, url2=None):
         if url2 is None:
             url2 = url
-        result = urllib.parse.urlparse(url, allow_none=allow_none)
+        self.checkRoundtrips1(url, parsed, split, allow_none=True)
+        empty = url[:0]
+        parsed = tuple(x or empty for x in parsed)
+        split = tuple(x or empty for x in split)
+        self.checkRoundtrips1(url, parsed, split, url2, allow_none=False)
+
+        result = urlparse(url, allow_none=True)
+        self.assertEqual(urlunparse(result, keep_empty=False), url2)
+        self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2)
+        result = urlparse(url, allow_none=False)
+        with self.assertRaises(ValueError):
+            urlunparse(result, keep_empty=True)
+        urlunparse(tuple(result), keep_empty=True)
+
+        result = urlsplit(url, allow_none=True)
+        self.assertEqual(urlunsplit(result, keep_empty=False), url2)
+        self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2)
+        result = urlsplit(url, allow_none=False)
+        with self.assertRaises(ValueError):
+            urlunsplit(result, keep_empty=True)
+        urlunsplit(tuple(result), keep_empty=True)
+
+    def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none):
+        if url2 is None:
+            url2 = url
+        result = urlparse(url, allow_none=allow_none)
         self.assertSequenceEqual(result, parsed)
         t = (result.scheme, result.netloc, result.path,
-             result.params, result.query, result.fragment)
+            result.params, result.query, result.fragment)
         self.assertSequenceEqual(t, parsed)
         # put it back together and it should be the same
-        result2 = urllib.parse.urlunparse(result, keep_empty=allow_none)
-        self.assertSequenceEqual(result2, url2)
-        self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
+        result2 = urlunparse(result)
+        self.assertEqual(result2, url2)
+        self.assertEqual(result2, result.geturl())
+        self.assertEqual(urlunparse(result, keep_empty=allow_none), url2)
+        self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2)
 
         # the result of geturl() is a fixpoint; we can always parse it
         # again to get the same result:
-        result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none)
-        self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
+        result3 = urlparse(result.geturl(), allow_none=allow_none)
+        self.assertEqual(result3.geturl(), result.geturl())
         self.assertSequenceEqual(result3, result)
         self.assertEqual(result3.scheme,   result.scheme)
         self.assertEqual(result3.netloc,   result.netloc)
@@ -149,18 +177,19 @@ def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
         self.assertEqual(result3.port,     result.port)
 
         # check the roundtrip using urlsplit() as well
-        result = urllib.parse.urlsplit(url, allow_none=allow_none)
+        result = urlsplit(url, allow_none=allow_none)
         self.assertSequenceEqual(result, split)
         t = (result.scheme, result.netloc, result.path,
-             result.query, result.fragment)
+            result.query, result.fragment)
         self.assertSequenceEqual(t, split)
-        result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none)
-        self.assertSequenceEqual(result2, url2)
-        self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
+        result2 = urlunsplit(result)
+        self.assertEqual(result2, url2)
+        self.assertEqual(result2, result.geturl())
+        self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2)
 
         # check the fixpoint property of re-parsing the result of geturl()
-        result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none)
-        self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
+        result3 = urlsplit(result.geturl(), allow_none=allow_none)
+        self.assertEqual(result3.geturl(), result.geturl())
         self.assertSequenceEqual(result3, result)
         self.assertEqual(result3.scheme,   result.scheme)
         self.assertEqual(result3.netloc,   result.netloc)
@@ -288,32 +317,28 @@ def test_roundtrips(self):
             ]
         for url, parsed, split in str_cases + bytes_cases:
             with self.subTest(url):
-                self.checkRoundtrips(url, parsed, split, allow_none=True)
-                empty = url[:0]
-                parsed = tuple(x or empty for x in parsed)
-                split = tuple(x or empty for x in split)
-                self.checkRoundtrips(url, parsed, split, allow_none=False)
+                self.checkRoundtrips(url, parsed, split)
 
     def test_roundtrips_normalization(self):
         str_cases = [
             ('///path/to/file',
-             '///path/to/file',
+             '/path/to/file',
              (None, '', '/path/to/file', None, None, None),
              (None, '', '/path/to/file', None, None)),
             ('scheme:///path/to/file',
-             'scheme:///path/to/file',
+             'scheme:/path/to/file',
              ('scheme', '', '/path/to/file', None, None, None),
              ('scheme', '', '/path/to/file', None, None)),
             ('file:/tmp/junk.txt',
-             'file:/tmp/junk.txt',
+             'file:///tmp/junk.txt',
              ('file', None, '/tmp/junk.txt', None, None, None),
              ('file', None, '/tmp/junk.txt', None, None)),
             ('http:/tmp/junk.txt',
-             'http:/tmp/junk.txt',
+             'http:///tmp/junk.txt',
              ('http', None, '/tmp/junk.txt', None, None, None),
              ('http', None, '/tmp/junk.txt', None, None)),
             ('https:/tmp/junk.txt',
-             'https:/tmp/junk.txt',
+             'https:///tmp/junk.txt',
              ('https', None, '/tmp/junk.txt', None, None, None),
              ('https', None, '/tmp/junk.txt', None, None)),
         ]
@@ -371,9 +396,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
                 relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
                 self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb)
 
-            relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True)
+            relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True))
             self.assertEqual(urllib.parse.urljoin(base, relurl3), expected)
-            relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True)
+            relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True))
             self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb)
 
     def test_unparse_parse(self):
@@ -796,7 +821,7 @@ def _encode(t):
                         url = url.rstrip(hash)
                         if frag is None:
                             frag = url[:0]
-                    self.assertEqual(result.geturl(keep_empty=allow_none), url)
+                    self.assertEqual(result.geturl(), url)
                     self.assertEqual(result, (defrag, frag))
                     self.assertEqual(result.url, defrag)
                     self.assertEqual(result.fragment, frag)
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -267,11 +267,27 @@ def _hostinfo(self):
         return hostname, port
 
 
-_DefragResultBase = namedtuple('_DefragResultBase', 'url fragment')
-_SplitResultBase = namedtuple(
-    '_SplitResultBase', 'scheme netloc path query fragment')
-_ParseResultBase = namedtuple(
-    '_ParseResultBase', 'scheme netloc path params query fragment')
+_UNSPECIFIED = ['not specified']
+_ALLOW_NONE_DEFAULT = False
+
+class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')):
+    def geturl(self):
+        if self.fragment or (self.fragment is not None and
+                             getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)):
+            return self.url + self._HASH + self.fragment
+        else:
+            return self.url
+
+class _SplitResultBase(namedtuple(
+    '_SplitResultBase', 'scheme netloc path query fragment')):
+    def geturl(self):
+        return urlunsplit(self)
+
+class _ParseResultBase(namedtuple(
+    '_ParseResultBase', 'scheme netloc path params query fragment')):
+    def geturl(self):
+        return urlunparse(self)
+
 
 _DefragResultBase.__doc__ = """
 DefragResult(url, fragment)
@@ -339,45 +355,27 @@ def _hostinfo(self):
 # retained since deprecating it isn't worth the hassle
 ResultBase = _NetlocResultMixinStr
 
-_ALLOW_NONE_DEFAULT = False
-
 # Structured result objects for string data
 class DefragResult(_DefragResultBase, _ResultMixinStr):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        if self.fragment or (keep_empty and self.fragment is not None):
-            return self.url + '#' + self.fragment
-        else:
-            return self.url
+    _HASH = '#'
 
 class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        return urlunsplit(self, keep_empty=keep_empty)
 
 class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        return urlunparse(self, keep_empty=keep_empty)
 
 # Structured result objects for bytes data
 class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        if self.fragment or (keep_empty and self.fragment is not None):
-            return self.url + b'#' + self.fragment
-        else:
-            return self.url
+    _HASH = b'#'
 
 class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        return urlunsplit(self, keep_empty=keep_empty)
 
 class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
     __slots__ = ()
-    def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
-        return urlunparse(self, keep_empty=keep_empty)
 
 # Set up the encode/decode result pairs
 def _fix_result_transcoding():
@@ -424,7 +422,9 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
         if query is None: query = ''
         if fragment is None: fragment = ''
     result = ParseResult(scheme, netloc, url, params, query, fragment)
-    return _coerce_result(result)
+    result = _coerce_result(result)
+    result._keep_empty = allow_none
+    return result
 
 def _urlparse(url, scheme=None, allow_fragments=True):
     scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
@@ -513,8 +513,10 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
         if netloc is None: netloc = ''
         if query is None: query = ''
         if fragment is None: fragment = ''
-    v = SplitResult(scheme, netloc, url, query, fragment)
-    return _coerce_result(v)
+    result = SplitResult(scheme, netloc, url, query, fragment)
+    result = _coerce_result(result)
+    result._keep_empty = allow_none
+    return result
 
 def _urlsplit(url, scheme=None, allow_fragments=True):
     # Only lstrip url as some applications rely on preserving trailing space.
@@ -551,13 +553,20 @@ def _urlsplit(url, scheme=None, allow_fragments=True):
     _checknetloc(netloc)
     return (scheme, netloc, url, query, fragment)
 
-def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
+def urlunparse(components, *, keep_empty=_UNSPECIFIED):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
     originally had redundant delimiters, e.g. a ? with an empty query
-    (the draft states that these are equivalent)."""
+    (the draft states that these are equivalent) and keep_empty is false
+    or components is the result of the urlparse() call with allow_none=False."""
     scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                   _coerce_args(*components))
+    if keep_empty is _UNSPECIFIED:
+        keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
+    elif keep_empty and not getattr(components, '_keep_empty', True):
+        raise ValueError('Cannot distinguish between empty and not defined '
+                         'URI components in the result of parsing URL with '
+                         'allow_none=False')
     if not keep_empty:
         if not netloc:
             if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
@@ -572,14 +581,22 @@ def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
         url = "%s;%s" % (url, params)
     return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment))
 
-def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
+def urlunsplit(components, *, keep_empty=_UNSPECIFIED):
     """Combine the elements of a tuple as returned by urlsplit() into a
     complete URL as a string. The data argument can be any five-item iterable.
     This may result in a slightly different, but equivalent URL, if the URL that
     was parsed originally had unnecessary delimiters (for example, a ? with an
-    empty query; the RFC states that these are equivalent)."""
+    empty query; the RFC states that these are equivalent) and keep_empty
+    is false or components is the result of the urlsplit() call with
+    allow_none=False."""
     scheme, netloc, url, query, fragment, _coerce_result = (
                                           _coerce_args(*components))
+    if keep_empty is _UNSPECIFIED:
+        keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
+    elif keep_empty and not getattr(components, '_keep_empty', True):
+        raise ValueError('Cannot distinguish between empty and not defined '
+                         'URI components in the result of parsing URL with '
+                         'allow_none=False')
     if not keep_empty:
         if not netloc:
             if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
@@ -692,7 +709,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT):
         frag = None
         defrag = url
     if not allow_none and frag is None: frag = ''
-    return _coerce_result(DefragResult(defrag, frag))
+    result = _coerce_result(DefragResult(defrag, frag))
+    result._keep_empty = allow_none
+    return result
 
 _hexdig = '0123456789ABCDEFabcdef'
 _hextobyte = None
diff --git a/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst b/Misc/NEWS.d/next/Library/2024-11-27-13-11-16.gh-issue-67041.ym2WKK.rst
@@ -0,0 +1,6 @@
+Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
+:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag`
+functions. Add the *keep_empty* parameter to
+:func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit`
+functions. This makes it possible to distinguish between empty and undefined
+URI components and to preserve empty components.