Skip to content

Commit eaa9ce6

Browse files
Preserve the status of allow_none in results.
1 parent b50b778 commit eaa9ce6

File tree

5 files changed

+124
-73
lines changed

5 files changed

+124
-73
lines changed

Doc/library/urllib.parse.rst

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,8 @@ or on combining URL components into a URL string.
318318
a ``?`` for an empty query), only ``None`` components are omitted.
319319
This allows restoring the URL that was parsed with the option
320320
``allow_none=True``.
321+
By default, *keep_empty* is true if *parts* is the result of the
322+
:func:`urlparse` call with ``allow_none=True``.
321323

322324
.. versionchanged:: 3.14
323325
Added the *keep_empty* parameter.
@@ -417,6 +419,8 @@ or on combining URL components into a URL string.
417419
a ``?`` for an empty query), only ``None`` components are omitted.
418420
This allows restoring the URL that was parsed with the option
419421
``allow_none=True``.
422+
By default, *keep_empty* is true if *parts* is the result of the
423+
:func:`urlsplit` call with ``allow_none=True``.
420424

421425
.. versionchanged:: 3.14
422426
Added the *keep_empty* parameter.
@@ -461,10 +465,8 @@ or on combining URL components into a URL string.
461465

462466

463467
.. versionchanged:: 3.5
464-
Behavior updated to match the semantics defined in :rfc:`3986`.
465468

466-
.. versionchanged:: 3.14
467-
Added the *keep_empty* parameter.
469+
Behavior updated to match the semantics defined in :rfc:`3986`.
468470

469471

470472
.. function:: urldefrag(url, *, allow_none=False)
@@ -588,12 +590,13 @@ These subclasses add the attributes listed in the documentation for
588590
those functions, the encoding and decoding support described in the
589591
previous section, as well as an additional method:
590592

591-
.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False)
593+
.. method:: urllib.parse.SplitResult.geturl()
592594

593595
Return the re-combined version of the original URL as a string. This may
594596
differ from the original URL in that the scheme may be normalized to lower
595597
case and empty components may be dropped. Specifically, empty parameters,
596-
queries, and fragment identifiers will be removed unless *keep_empty* is true.
598+
queries, and fragment identifiers will be removed unless the URL was parsed
599+
with ``allow_none=True``.
597600

598601
For :func:`urldefrag` results, only empty fragment identifiers will be removed.
599602
For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be
@@ -611,11 +614,8 @@ previous section, as well as an additional method:
611614
>>> r2.geturl()
612615
'http://www.Python.org/doc/'
613616
>>> r3 = urlsplit(url, allow_none=True)
614-
>>> r1.geturl(keep_empty=True)
615-
'http://www.Python.org/doc/'
616-
617-
.. versionchanged:: 3.14
618-
Added the *keep_empty* parameter.
617+
>>> r3.geturl()
618+
'http://www.Python.org/doc/#'
619619

620620

621621
The following classes provide the implementations of the structured parse

Doc/whatsnew/3.14.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -595,8 +595,9 @@ urllib.parse
595595
* Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
596596
:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions.
597597
Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and
598-
:func:`~urllib.parse.urlunsplit` functions and
599-
:func:`~urllib.parse.SplitResult.geturl` methods.
598+
:func:`~urllib.parse.urlunsplit` functions.
599+
This makes it possible to distinguish between empty and undefined URI components
600+
and preserve empty components.
600601
(Contributed by Serhiy Storchaka in :gh:`67041`.)
601602

602603
uuid

Lib/test/test_urlparse.py

Lines changed: 53 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import unicodedata
44
import unittest
55
import urllib.parse
6+
from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit
67

78
RFC1808_BASE = "http://a/b/c/d;p?q#f"
89
RFC2396_BASE = "http://a/b/c/d;p?q"
@@ -119,23 +120,50 @@ def _encode(self, s):
119120
return tuple(self._encode(x) for x in s)
120121
return s
121122

122-
def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
123+
def checkRoundtrips(self, url, parsed, split, url2=None):
123124
if url2 is None:
124125
url2 = url
125-
result = urllib.parse.urlparse(url, allow_none=allow_none)
126+
self.checkRoundtrips1(url, parsed, split, allow_none=True)
127+
empty = url[:0]
128+
parsed = tuple(x or empty for x in parsed)
129+
split = tuple(x or empty for x in split)
130+
self.checkRoundtrips1(url, parsed, split, url2, allow_none=False)
131+
132+
result = urlparse(url, allow_none=True)
133+
self.assertEqual(urlunparse(result, keep_empty=False), url2)
134+
self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2)
135+
result = urlparse(url, allow_none=False)
136+
with self.assertRaises(ValueError):
137+
urlunparse(result, keep_empty=True)
138+
urlunparse(tuple(result), keep_empty=True)
139+
140+
result = urlsplit(url, allow_none=True)
141+
self.assertEqual(urlunsplit(result, keep_empty=False), url2)
142+
self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2)
143+
result = urlsplit(url, allow_none=False)
144+
with self.assertRaises(ValueError):
145+
urlunsplit(result, keep_empty=True)
146+
urlunsplit(tuple(result), keep_empty=True)
147+
148+
def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none):
149+
if url2 is None:
150+
url2 = url
151+
result = urlparse(url, allow_none=allow_none)
126152
self.assertSequenceEqual(result, parsed)
127153
t = (result.scheme, result.netloc, result.path,
128-
result.params, result.query, result.fragment)
154+
result.params, result.query, result.fragment)
129155
self.assertSequenceEqual(t, parsed)
130156
# put it back together and it should be the same
131-
result2 = urllib.parse.urlunparse(result, keep_empty=allow_none)
132-
self.assertSequenceEqual(result2, url2)
133-
self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
157+
result2 = urlunparse(result)
158+
self.assertEqual(result2, url2)
159+
self.assertEqual(result2, result.geturl())
160+
self.assertEqual(urlunparse(result, keep_empty=allow_none), url2)
161+
self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2)
134162

135163
# the result of geturl() is a fixpoint; we can always parse it
136164
# again to get the same result:
137-
result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none)
138-
self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
165+
result3 = urlparse(result.geturl(), allow_none=allow_none)
166+
self.assertEqual(result3.geturl(), result.geturl())
139167
self.assertSequenceEqual(result3, result)
140168
self.assertEqual(result3.scheme, result.scheme)
141169
self.assertEqual(result3.netloc, result.netloc)
@@ -149,18 +177,19 @@ def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
149177
self.assertEqual(result3.port, result.port)
150178

151179
# check the roundtrip using urlsplit() as well
152-
result = urllib.parse.urlsplit(url, allow_none=allow_none)
180+
result = urlsplit(url, allow_none=allow_none)
153181
self.assertSequenceEqual(result, split)
154182
t = (result.scheme, result.netloc, result.path,
155-
result.query, result.fragment)
183+
result.query, result.fragment)
156184
self.assertSequenceEqual(t, split)
157-
result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none)
158-
self.assertSequenceEqual(result2, url2)
159-
self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
185+
result2 = urlunsplit(result)
186+
self.assertEqual(result2, url2)
187+
self.assertEqual(result2, result.geturl())
188+
self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2)
160189

161190
# check the fixpoint property of re-parsing the result of geturl()
162-
result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none)
163-
self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
191+
result3 = urlsplit(result.geturl(), allow_none=allow_none)
192+
self.assertEqual(result3.geturl(), result.geturl())
164193
self.assertSequenceEqual(result3, result)
165194
self.assertEqual(result3.scheme, result.scheme)
166195
self.assertEqual(result3.netloc, result.netloc)
@@ -288,32 +317,28 @@ def test_roundtrips(self):
288317
]
289318
for url, parsed, split in str_cases + bytes_cases:
290319
with self.subTest(url):
291-
self.checkRoundtrips(url, parsed, split, allow_none=True)
292-
empty = url[:0]
293-
parsed = tuple(x or empty for x in parsed)
294-
split = tuple(x or empty for x in split)
295-
self.checkRoundtrips(url, parsed, split, allow_none=False)
320+
self.checkRoundtrips(url, parsed, split)
296321

297322
def test_roundtrips_normalization(self):
298323
str_cases = [
299324
('///path/to/file',
300-
'///path/to/file',
325+
'/path/to/file',
301326
(None, '', '/path/to/file', None, None, None),
302327
(None, '', '/path/to/file', None, None)),
303328
('scheme:///path/to/file',
304-
'scheme:///path/to/file',
329+
'scheme:/path/to/file',
305330
('scheme', '', '/path/to/file', None, None, None),
306331
('scheme', '', '/path/to/file', None, None)),
307332
('file:/tmp/junk.txt',
308-
'file:/tmp/junk.txt',
333+
'file:///tmp/junk.txt',
309334
('file', None, '/tmp/junk.txt', None, None, None),
310335
('file', None, '/tmp/junk.txt', None, None)),
311336
('http:/tmp/junk.txt',
312-
'http:/tmp/junk.txt',
337+
'http:///tmp/junk.txt',
313338
('http', None, '/tmp/junk.txt', None, None, None),
314339
('http', None, '/tmp/junk.txt', None, None)),
315340
('https:/tmp/junk.txt',
316-
'https:/tmp/junk.txt',
341+
'https:///tmp/junk.txt',
317342
('https', None, '/tmp/junk.txt', None, None, None),
318343
('https', None, '/tmp/junk.txt', None, None)),
319344
]
@@ -371,9 +396,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
371396
relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
372397
self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb)
373398

374-
relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True)
399+
relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True))
375400
self.assertEqual(urllib.parse.urljoin(base, relurl3), expected)
376-
relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True)
401+
relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True))
377402
self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb)
378403

379404
def test_unparse_parse(self):
@@ -796,7 +821,7 @@ def _encode(t):
796821
url = url.rstrip(hash)
797822
if frag is None:
798823
frag = url[:0]
799-
self.assertEqual(result.geturl(keep_empty=allow_none), url)
824+
self.assertEqual(result.geturl(), url)
800825
self.assertEqual(result, (defrag, frag))
801826
self.assertEqual(result.url, defrag)
802827
self.assertEqual(result.fragment, frag)

Lib/urllib/parse.py

Lines changed: 52 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -267,11 +267,27 @@ def _hostinfo(self):
267267
return hostname, port
268268

269269

270-
_DefragResultBase = namedtuple('_DefragResultBase', 'url fragment')
271-
_SplitResultBase = namedtuple(
272-
'_SplitResultBase', 'scheme netloc path query fragment')
273-
_ParseResultBase = namedtuple(
274-
'_ParseResultBase', 'scheme netloc path params query fragment')
270+
_UNSPECIFIED = ['not specified']
271+
_ALLOW_NONE_DEFAULT = False
272+
273+
class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')):
274+
def geturl(self):
275+
if self.fragment or (self.fragment is not None and
276+
getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)):
277+
return self.url + self._HASH + self.fragment
278+
else:
279+
return self.url
280+
281+
class _SplitResultBase(namedtuple(
282+
'_SplitResultBase', 'scheme netloc path query fragment')):
283+
def geturl(self):
284+
return urlunsplit(self)
285+
286+
class _ParseResultBase(namedtuple(
287+
'_ParseResultBase', 'scheme netloc path params query fragment')):
288+
def geturl(self):
289+
return urlunparse(self)
290+
275291

276292
_DefragResultBase.__doc__ = """
277293
DefragResult(url, fragment)
@@ -339,45 +355,27 @@ def _hostinfo(self):
339355
# retained since deprecating it isn't worth the hassle
340356
ResultBase = _NetlocResultMixinStr
341357

342-
_ALLOW_NONE_DEFAULT = False
343-
344358
# Structured result objects for string data
345359
class DefragResult(_DefragResultBase, _ResultMixinStr):
346360
__slots__ = ()
347-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
348-
if self.fragment or (keep_empty and self.fragment is not None):
349-
return self.url + '#' + self.fragment
350-
else:
351-
return self.url
361+
_HASH = '#'
352362

353363
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
354364
__slots__ = ()
355-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
356-
return urlunsplit(self, keep_empty=keep_empty)
357365

358366
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
359367
__slots__ = ()
360-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
361-
return urlunparse(self, keep_empty=keep_empty)
362368

363369
# Structured result objects for bytes data
364370
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
365371
__slots__ = ()
366-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
367-
if self.fragment or (keep_empty and self.fragment is not None):
368-
return self.url + b'#' + self.fragment
369-
else:
370-
return self.url
372+
_HASH = b'#'
371373

372374
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
373375
__slots__ = ()
374-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
375-
return urlunsplit(self, keep_empty=keep_empty)
376376

377377
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
378378
__slots__ = ()
379-
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
380-
return urlunparse(self, keep_empty=keep_empty)
381379

382380
# Set up the encode/decode result pairs
383381
def _fix_result_transcoding():
@@ -424,7 +422,9 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
424422
if query is None: query = ''
425423
if fragment is None: fragment = ''
426424
result = ParseResult(scheme, netloc, url, params, query, fragment)
427-
return _coerce_result(result)
425+
result = _coerce_result(result)
426+
result._keep_empty = allow_none
427+
return result
428428

429429
def _urlparse(url, scheme=None, allow_fragments=True):
430430
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
@@ -513,8 +513,10 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
513513
if netloc is None: netloc = ''
514514
if query is None: query = ''
515515
if fragment is None: fragment = ''
516-
v = SplitResult(scheme, netloc, url, query, fragment)
517-
return _coerce_result(v)
516+
result = SplitResult(scheme, netloc, url, query, fragment)
517+
result = _coerce_result(result)
518+
result._keep_empty = allow_none
519+
return result
518520

519521
def _urlsplit(url, scheme=None, allow_fragments=True):
520522
# Only lstrip url as some applications rely on preserving trailing space.
@@ -551,13 +553,20 @@ def _urlsplit(url, scheme=None, allow_fragments=True):
551553
_checknetloc(netloc)
552554
return (scheme, netloc, url, query, fragment)
553555

554-
def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
556+
def urlunparse(components, *, keep_empty=_UNSPECIFIED):
555557
"""Put a parsed URL back together again. This may result in a
556558
slightly different, but equivalent URL, if the URL that was parsed
557559
originally had redundant delimiters, e.g. a ? with an empty query
558-
(the draft states that these are equivalent)."""
560+
(the draft states that these are equivalent) and keep_empty is false
561+
or components is the result of the urlparse() call with allow_none=False."""
559562
scheme, netloc, url, params, query, fragment, _coerce_result = (
560563
_coerce_args(*components))
564+
if keep_empty is _UNSPECIFIED:
565+
keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
566+
elif keep_empty and not getattr(components, '_keep_empty', True):
567+
raise ValueError('Cannot distinguish between empty and not defined '
568+
'URI components in the result of parsing URL with '
569+
'allow_none=False')
561570
if not keep_empty:
562571
if not netloc:
563572
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
@@ -572,14 +581,22 @@ def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
572581
url = "%s;%s" % (url, params)
573582
return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment))
574583

575-
def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
584+
def urlunsplit(components, *, keep_empty=_UNSPECIFIED):
576585
"""Combine the elements of a tuple as returned by urlsplit() into a
577586
complete URL as a string. The data argument can be any five-item iterable.
578587
This may result in a slightly different, but equivalent URL, if the URL that
579588
was parsed originally had unnecessary delimiters (for example, a ? with an
580-
empty query; the RFC states that these are equivalent)."""
589+
empty query; the RFC states that these are equivalent) and keep_empty
590+
is false or components is the result of the urlsplit() call with
591+
allow_none=False."""
581592
scheme, netloc, url, query, fragment, _coerce_result = (
582593
_coerce_args(*components))
594+
if keep_empty is _UNSPECIFIED:
595+
keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
596+
elif keep_empty and not getattr(components, '_keep_empty', True):
597+
raise ValueError('Cannot distinguish between empty and not defined '
598+
'URI components in the result of parsing URL with '
599+
'allow_none=False')
583600
if not keep_empty:
584601
if not netloc:
585602
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
@@ -692,7 +709,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT):
692709
frag = None
693710
defrag = url
694711
if not allow_none and frag is None: frag = ''
695-
return _coerce_result(DefragResult(defrag, frag))
712+
result = _coerce_result(DefragResult(defrag, frag))
713+
result._keep_empty = allow_none
714+
return result
696715

697716
_hexdig = '0123456789ABCDEFabcdef'
698717
_hextobyte = None
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
2+
:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag`
3+
functions. Add the *keep_empty* parameter to
4+
:func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit`
5+
functions. This makes it possible to distinguish between empty and undefined URI
6+
components and preserve empty components.

0 commit comments

Comments
 (0)