diff --git a/news/13501.bugfix.rst b/news/13501.bugfix.rst new file mode 100644 index 00000000000..cb06bbb90d1 --- /dev/null +++ b/news/13501.bugfix.rst @@ -0,0 +1 @@ +Make conversion of file URLs more consistent across Python versions. diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2e2c0f836ac..f1bacd7bca1 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -121,43 +121,47 @@ def _clean_url_path_part(part: str) -> str: return urllib.parse.quote(urllib.parse.unquote(part)) -def _clean_file_url_path(part: str) -> str: +def _clean_file_url(url: str) -> str: """ - Clean the first part of a URL path that corresponds to a local + Clean a URL that corresponds to a local filesystem path (i.e. the first part after splitting on "@" characters). """ + # Replace "@" characters to protect them from percent-encoding. + at_symbol_token = "---PIP_AT_SYMBOL---" + assert at_symbol_token not in url + url = url.replace("@", at_symbol_token) + parts = urllib.parse.urlsplit(url) + # We unquote prior to quoting to make sure nothing is double quoted. # Also, on Windows the path part might contain a drive letter which # should not be quoted. On Linux where drive letters do not - # exist, the colon should be quoted. We rely on urllib.request - # to do the right thing here. - ret = urllib.request.pathname2url(urllib.request.url2pathname(part)) - if ret.startswith("///"): - # Remove any URL authority section, leaving only the URL path. - ret = ret.removeprefix("//") - return ret + # exist, the colon should be quoted. + tidy_url = path_to_url(url_to_path(url), normalize_path=False) + tidy_parts = urllib.parse.urlsplit(tidy_url) + + # Restore the original scheme, query and fragment components. + url = urllib.parse.urlunsplit(tidy_parts[:3] + parts[3:]) + url = url.replace(tidy_parts.scheme, parts.scheme, 1) + + # Restore "@" characters that were replaced earlier. + return url.replace(at_symbol_token, "@") # percent-encoded: / _reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE) -def _clean_url_path(path: str, is_local_path: bool) -> str: +def _clean_url_path(path: str) -> str: """ Clean the path portion of a URL. """ - if is_local_path: - clean_func = _clean_file_url_path - else: - clean_func = _clean_url_path_part - # Split on the reserved characters prior to cleaning so that # revision strings in VCS URLs are properly preserved. parts = _reserved_chars_re.split(path) cleaned_parts = [] for to_clean, reserved in pairwise(itertools.chain(parts, [""])): - cleaned_parts.append(clean_func(to_clean)) + cleaned_parts.append(_clean_url_path_part(to_clean)) # Normalize %xx escapes (e.g. %2f -> %2F) cleaned_parts.append(reserved.upper()) @@ -174,13 +178,10 @@ def _ensure_quoted_url(url: str) -> str: # `scheme://netloc/path?query#fragment`. result = urllib.parse.urlsplit(url) # If the netloc is empty, then the URL refers to a local filesystem path. - is_local_path = not result.netloc - path = _clean_url_path(result.path, is_local_path=is_local_path) - # Temporarily replace scheme with file to ensure the URL generated by - # urlunsplit() contains an empty netloc (file://) as per RFC 1738. - ret = urllib.parse.urlunsplit(result._replace(scheme="file", path=path)) - ret = result.scheme + ret[4:] # Restore original scheme. - return ret + if not result.netloc: + return _clean_file_url(url) + path = _clean_url_path(result.path) + return urllib.parse.urlunsplit(result._replace(path=path)) def _absolute_link_url(base_url: str, url: str) -> str: diff --git a/src/pip/_internal/utils/urls.py b/src/pip/_internal/utils/urls.py index e951a5e4e47..e8f3cb25b5b 100644 --- a/src/pip/_internal/utils/urls.py +++ b/src/pip/_internal/utils/urls.py @@ -1,55 +1,70 @@ import os -import string +import sys import urllib.parse -import urllib.request from .compat import WINDOWS -def path_to_url(path: str) -> str: +def path_to_url(path: str, normalize_path: bool = True) -> str: """ - Convert a path to a file: URL. The path will be made absolute and have - quoted path parts. + Convert a path to a file: URL with quoted path parts. The path will be + normalized and made absolute if *normalize_path* is true (the default.) """ - path = os.path.normpath(os.path.abspath(path)) - url = urllib.parse.urljoin("file://", urllib.request.pathname2url(path)) - return url + if normalize_path: + path = os.path.abspath(path) + if WINDOWS: + path = path.replace("\\", "/") + + drive, tail = os.path.splitdrive(path) + if drive: + if drive[:4] == "//?/": + drive = drive[4:] + if drive[:4].upper() == "UNC/": + drive = "//" + drive[4:] + if drive[1:] == ":": + drive = "///" + drive + elif tail.startswith("/"): + tail = "//" + tail + + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + drive = urllib.parse.quote(drive, "/:", encoding, errors) + tail = urllib.parse.quote(tail, "/", encoding, errors) + return "file:" + drive + tail def url_to_path(url: str) -> str: """ Convert a file: URL to a path. """ - assert url.startswith( - "file:" + scheme, netloc, path = urllib.parse.urlsplit(url)[:3] + assert scheme == "file" or scheme.endswith( + "+file" ), f"You can only turn file: urls into filenames (not {url!r})" - _, netloc, path, _, _ = urllib.parse.urlsplit(url) + if WINDOWS: + # e.g. file://c:/foo + if netloc[1:2] == ":": + path = netloc + path + + # e.g. file://server/share/foo + elif netloc and netloc != "localhost": + path = "//" + netloc + path + + # e.g. file://///server/share/foo + elif path[:3] == "///": + path = path[1:] + + # e.g. file:///c:/foo + elif path[:1] == "/" and path[2:3] == ":": + path = path[1:] - if not netloc or netloc == "localhost": - # According to RFC 8089, same as empty authority. - netloc = "" - elif WINDOWS: - # If we have a UNC path, prepend UNC share notation. - netloc = "\\\\" + netloc - else: + path = path.replace("/", "\\") + elif netloc and netloc != "localhost": raise ValueError( f"non-local file URIs are not supported on this platform: {url!r}" ) - path = urllib.request.url2pathname(netloc + path) - - # On Windows, urlsplit parses the path as something like "/C:/Users/foo". - # This creates issues for path-related functions like io.open(), so we try - # to detect and strip the leading slash. - if ( - WINDOWS - and not netloc # Not UNC. - and len(path) >= 3 - and path[0] == "/" # Leading slash to strip. - and path[1] in string.ascii_letters # Drive letter. - and path[2:4] in (":", ":/") # Colon + end of string, or colon + absolute path. - ): - path = path[1:] - - return path + encoding = sys.getfilesystemencoding() + errors = sys.getfilesystemencodeerrors() + return urllib.parse.unquote(path, encoding, errors) diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py index 78fe3604480..01a5955572c 100644 --- a/tests/lib/__init__.py +++ b/tests/lib/__init__.py @@ -16,7 +16,6 @@ from io import BytesIO, StringIO from textwrap import dedent from typing import Any, AnyStr, Callable, Literal, Protocol, Union, cast -from urllib.request import pathname2url from zipfile import ZipFile import pytest @@ -1364,19 +1363,3 @@ def __call__( CertFactory = Callable[[], str] - -# ------------------------------------------------------------------------- -# Accommodations for Windows path and URL changes in recent Python releases -# ------------------------------------------------------------------------- - -# Trailing slashes are now preserved on Windows, matching POSIX behaviour. -# BPO: https://github.com/python/cpython/issues/126212 -does_pathname2url_preserve_trailing_slash = pathname2url("C:\\foo\\").endswith("/") -skip_needs_new_pathname2url_trailing_slash_behavior_win = pytest.mark.skipif( - sys.platform != "win32" or not does_pathname2url_preserve_trailing_slash, - reason="testing windows (pathname2url) behavior for newer CPython", -) -skip_needs_old_pathname2url_trailing_slash_behavior_win = pytest.mark.skipif( - sys.platform != "win32" or does_pathname2url_preserve_trailing_slash, - reason="testing windows (pathname2url) behavior for older CPython", -) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index fa688f8e42f..318e6ef8a81 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -33,17 +33,13 @@ Link, LinkHash, MetadataFile, + _clean_file_url, _clean_url_path, _ensure_quoted_url, ) from pip._internal.network.session import PipSession -from tests.lib import ( - TestData, - make_test_link_collector, - skip_needs_new_pathname2url_trailing_slash_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, -) +from tests.lib import TestData, make_test_link_collector ACCEPT = ", ".join( [ @@ -296,31 +292,30 @@ def test_get_simple_response_dont_log_clear_text_password( ("a %2f b", "a%20%2F%20b"), ], ) -@pytest.mark.parametrize("is_local_path", [True, False]) -def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: - assert _clean_url_path(path, is_local_path=is_local_path) == expected +def test_clean_url_path(path: str, expected: str) -> None: + assert _clean_url_path(path) == expected @pytest.mark.parametrize( - "path, expected", + "url, expected", [ # Test a VCS path with a Windows drive letter and revision. pytest.param( - "/T:/with space/repo.git@1.0", - "/T:/with%20space/repo.git@1.0", + "file:/T:/with space/repo.git@1.0", + "file:///T:/with%20space/repo.git@1.0", marks=pytest.mark.skipif("sys.platform != 'win32'"), ), # Test a VCS path with a Windows drive letter and revision, # running on non-windows platform. pytest.param( - "/T:/with space/repo.git@1.0", - "/T%3A/with%20space/repo.git@1.0", + "file:/T:/with space/repo.git@1.0", + "file:///T%3A/with%20space/repo.git@1.0", marks=pytest.mark.skipif("sys.platform == 'win32'"), ), ], ) -def test_clean_url_path_with_local_path(path: str, expected: str) -> None: - actual = _clean_url_path(path, is_local_path=True) +def test_clean_file_url(url: str, expected: str) -> None: + actual = _clean_file_url(url) assert actual == expected @@ -387,16 +382,11 @@ def test_clean_url_path_with_local_path(path: str, expected: str) -> None: ), # URL with Windows drive letter. The `:` after the drive # letter should not be quoted. The trailing `/` should be - # removed. - pytest.param( - "file:///T:/path/with spaces/", - "file:///T:/path/with%20spaces", - marks=skip_needs_old_pathname2url_trailing_slash_behavior_win, - ), + # retained. pytest.param( "file:///T:/path/with spaces/", "file:///T:/path/with%20spaces/", - marks=skip_needs_new_pathname2url_trailing_slash_behavior_win, + marks=pytest.mark.skipif("sys.platform != 'win32'"), ), # URL with Windows drive letter, running on non-windows # platform. The `:` after the drive should be quoted.