diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index aabc360289a0d0..b583ff3284bb4e 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -362,6 +362,11 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True): self.assertEqual(urllib.parse.urljoin(base, relurl), expected) relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) + else: + relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) + self.assertNotEqual(urllib.parse.urljoin(base, relurl), expected) + relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) + self.assertNotEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) def test_unparse_parse(self): str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] @@ -568,6 +573,9 @@ def test_urljoins(self): # slashes self.checkJoin('http://a/b/c/d/e/', '../../f/g/', 'http://a/b/c/f/g/') self.checkJoin('http://a/b/c/d/e', '../../f/g/', 'http://a/b/f/g/') + self.checkJoin('http://a/b/c/d/e//', '../../f/g/', 'http://a/b/c/d/f/g/') + self.checkJoin('http://a/b/c/d/e///', '../../f/g/', 'http://a/b/c/d/e/f/g/') + self.checkJoin('http://a/b/c/d/e////', '../../f/g/', 'http://a/b/c/d/e//f/g/') self.checkJoin('http://a/b/c/d/e/', '/../../f/g/', 'http://a/f/g/') self.checkJoin('http://a/b/c/d/e', '/../../f/g/', 'http://a/f/g/') self.checkJoin('http://a/b/c/d/e/', '../../f/g', 'http://a/b/c/f/g') @@ -645,6 +653,16 @@ def test_urljoins_relative_base(self): self.checkJoin('//', '/w', '///w') self.checkJoin('//', '///w', '///w') self.checkJoin('//', 'w', '///w') + self.checkJoin('//', '../w', '///w') + self.checkJoin('//', './w', '///w') + self.checkJoin('//', '..//w', '///w') + self.checkJoin('//', './/w', '///w') + self.checkJoin('//', '..', '//') + self.checkJoin('//', '.', '//') + self.checkJoin('//', '../', '//') + self.checkJoin('//', './', '//') + self.checkJoin('//', '..//', '///') + self.checkJoin('//', './/', '///') self.checkJoin('//a', '', '//a') self.checkJoin('//a', '//', '//a') @@ -653,6 +671,16 @@ def test_urljoins_relative_base(self): self.checkJoin('//a', '/w', '//a/w') self.checkJoin('//a', '///w', '//a/w') self.checkJoin('//a', 'w', '//a/w') + self.checkJoin('//a', '../w', '//a/w') + self.checkJoin('//a', './w', '//a/w') + self.checkJoin('//a', '..//w', '//a/w') + self.checkJoin('//a', './/w', '//a/w') + self.checkJoin('//a', '..', '//a') + self.checkJoin('//a', '.', '//a') + self.checkJoin('//a', '../', '//a') + self.checkJoin('//a', './', '//a') + self.checkJoin('//a', '..//', '//a/') + self.checkJoin('//a', './/', '//a/') for scheme in '', 'http:': self.checkJoin('http:', scheme + '', 'http:') @@ -661,7 +689,21 @@ def test_urljoins_relative_base(self): self.checkJoin('http:', scheme + '//v/w', 'http://v/w') self.checkJoin('http:', scheme + '/w', 'http:/w') self.checkJoin('http:', scheme + '///w', 'http:/w') - self.checkJoin('http:', scheme + 'w', 'http:/w') + self.checkJoin('http:', scheme + 'w', 'http:w') + self.checkJoin('http:', scheme + '../w', 'http:w') + self.checkJoin('http:', scheme + './w', 'http:w') + self.checkJoin('http:', scheme + '..//w', 'http:/w') + self.checkJoin('http:', scheme + './/w', 'http:/w') + self.checkJoin('http:', scheme + '..///w', 'http:////w') + self.checkJoin('http:', scheme + './//w', 'http:////w') + self.checkJoin('http:', scheme + '..', 'http:') + self.checkJoin('http:', scheme + '.', 'http:') + self.checkJoin('http:', scheme + '../', 'http:') + self.checkJoin('http:', scheme + './', 'http:') + self.checkJoin('http:', scheme + '..//', 'http:/') + self.checkJoin('http:', scheme + './/', 'http:/') + self.checkJoin('http:', scheme + '..///', 'http:////') + self.checkJoin('http:', scheme + './//', 'http:////') self.checkJoin('http://', scheme + '', 'http://') self.checkJoin('http://', scheme + '//', 'http://') @@ -670,6 +712,20 @@ def test_urljoins_relative_base(self): self.checkJoin('http://', scheme + '/w', 'http:///w') self.checkJoin('http://', scheme + '///w', 'http:///w') self.checkJoin('http://', scheme + 'w', 'http:///w') + self.checkJoin('http://', scheme + '../w', 'http:///w') + self.checkJoin('http://', scheme + './w', 'http:///w') + self.checkJoin('http://', scheme + '..//w', 'http:///w') + self.checkJoin('http://', scheme + './/w', 'http:///w') + self.checkJoin('http://', scheme + '..///w', 'http:////w') + self.checkJoin('http://', scheme + './//w', 'http:////w') + self.checkJoin('http://', scheme + '..', 'http://') + self.checkJoin('http://', scheme + '.', 'http://') + self.checkJoin('http://', scheme + '../', 'http://') + self.checkJoin('http://', scheme + './', 'http://') + self.checkJoin('http://', scheme + '..//', 'http:///') + self.checkJoin('http://', scheme + './/', 'http:///') + self.checkJoin('http://', scheme + '..///', 'http:////') + self.checkJoin('http://', scheme + './//', 'http:////') self.checkJoin('http://a', scheme + '', 'http://a') self.checkJoin('http://a', scheme + '//', 'http://a') @@ -678,6 +734,38 @@ def test_urljoins_relative_base(self): self.checkJoin('http://a', scheme + '/w', 'http://a/w') self.checkJoin('http://a', scheme + '///w', 'http://a/w') self.checkJoin('http://a', scheme + 'w', 'http://a/w') + self.checkJoin('http://a', scheme + '../w', 'http://a/w') + self.checkJoin('http://a', scheme + './w', 'http://a/w') + self.checkJoin('http://a', scheme + '..//w', 'http://a/w') + self.checkJoin('http://a', scheme + './/w', 'http://a/w') + self.checkJoin('http://a', scheme + '..///w', 'http://a//w') + self.checkJoin('http://a', scheme + './//w', 'http://a//w') + self.checkJoin('http://a', scheme + '..', 'http://a') + self.checkJoin('http://a', scheme + '.', 'http://a') + self.checkJoin('http://a', scheme + '../', 'http://a') + self.checkJoin('http://a', scheme + './', 'http://a') + self.checkJoin('http://a', scheme + '..//', 'http://a/') + self.checkJoin('http://a', scheme + './/', 'http://a/') + self.checkJoin('http://a', scheme + '..///', 'http://a//') + self.checkJoin('http://a', scheme + './//', 'http://a//') + + self.checkJoin('b/c', '', 'b/c') + self.checkJoin('b/c', '//', 'b/c') + self.checkJoin('b/c', '//v', '//v') + self.checkJoin('b/c', '//v/w', '//v/w') + self.checkJoin('b/c', '/w', '/w') + self.checkJoin('b/c', '///w', '/w') + self.checkJoin('b/c', 'w', 'b/w') + self.checkJoin('b/c', '../w', 'w') + self.checkJoin('b/c', '../../w', 'w') + self.checkJoin('b/c', '../../../w', 'w') + self.checkJoin('b/c', 'w/.', 'b/w/') + self.checkJoin('b/c', '../w/.', 'w/') + self.checkJoin('b/c', '../../w/.', 'w/') + self.checkJoin('b/c', '../../../w/.', 'w/') + self.checkJoin('b/c', '..', '') + self.checkJoin('b/c', '../..', '') + self.checkJoin('b/c', '../../..', '') self.checkJoin('/b/c', '', '/b/c') self.checkJoin('/b/c', '//', '/b/c') @@ -686,6 +774,16 @@ def test_urljoins_relative_base(self): self.checkJoin('/b/c', '/w', '/w') self.checkJoin('/b/c', '///w', '/w') self.checkJoin('/b/c', 'w', '/b/w') + self.checkJoin('/b/c', '../w', '/w') + self.checkJoin('/b/c', '../../w', '/w') + self.checkJoin('/b/c', '../../../w', '/w') + self.checkJoin('/b/c', 'w/.', '/b/w/') + self.checkJoin('/b/c', '../w/.', '/w/') + self.checkJoin('/b/c', '../../w/.', '/w/') + self.checkJoin('/b/c', '../../../w/.', '/w/') + self.checkJoin('/b/c', '..', '/') + self.checkJoin('/b/c', '../..', '/') + self.checkJoin('/b/c', '../../..', '/') self.checkJoin('///b/c', '', '///b/c') self.checkJoin('///b/c', '//', '///b/c') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 9d51f4c6812b57..5ce9da4e30c1a9 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -610,31 +610,22 @@ def urljoin(base, url, allow_fragments=True): return _coerce_result(_urlunsplit(scheme, netloc, path, query, fragment)) - base_parts = bpath.split('/') - if base_parts[-1] != '': - # the last item is not a directory, so will not be taken into account - # in resolving the relative path - del base_parts[-1] - # for rfc3986, ignore all base path should the first character be root. - if path[:1] == '/': - segments = path.split('/') - else: - segments = base_parts + path.split('/') - # filter out elements that would cause redundant slashes on re-joining - # the resolved_path - segments[1:-1] = filter(None, segments[1:-1]) + if path[:1] != '/' and '/' in bpath: + path = bpath.rsplit('/', 1)[0] + '/' + path - resolved_path = [] + path = _remove_dot_segments(path) + return _coerce_result(_urlunsplit(scheme, netloc, path, query, fragment)) +def _remove_dot_segments(path): + segments = path.split('/') + min_len = 0 if segments[0] else 1 + + resolved_path = [] for seg in segments: if seg == '..': - try: + if len(resolved_path) > min_len: resolved_path.pop() - except IndexError: - # ignore any .. segments that would otherwise cause an IndexError - # when popped from resolved_path if resolving for rfc3986 - pass elif seg == '.': continue else: @@ -645,9 +636,7 @@ def urljoin(base, url, allow_fragments=True): # then we need to append the trailing '/' resolved_path.append('') - return _coerce_result(_urlunsplit(scheme, netloc, '/'.join( - resolved_path) or '/', query, fragment)) - + return '/'.join(resolved_path) def urldefrag(url): """Removes any existing fragment from URL. diff --git a/Misc/NEWS.d/next/Library/2024-11-11-13-05-38.gh-issue-69589.7Un1Ua.rst b/Misc/NEWS.d/next/Library/2024-11-11-13-05-38.gh-issue-69589.7Un1Ua.rst new file mode 100644 index 00000000000000..cebdbed7495c0d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-11-13-05-38.gh-issue-69589.7Un1Ua.rst @@ -0,0 +1,2 @@ +Fix `urllib.parse.urljoin` for the case when the base path is relative +and the relative reference path starts with '..'. diff --git a/Misc/NEWS.d/next/Library/2024-11-11-13-33-56.gh-issue-84774.KxbM6e.rst b/Misc/NEWS.d/next/Library/2024-11-11-13-33-56.gh-issue-84774.KxbM6e.rst new file mode 100644 index 00000000000000..6781a69a63f8ab --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-11-13-33-56.gh-issue-84774.KxbM6e.rst @@ -0,0 +1 @@ +Preserve double slashes in the path in :func:`urllib.parse.urljoin`.