From 23318316757e36b6d1c1ffe0f22a3119710f81e8 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 21 Jul 2025 20:23:37 +0100 Subject: [PATCH 1/6] Stop working around old Git bug Remove code from 2011 that claims to work around a Git bug. Unfortunately the cited link is dead, and the tests all pass with the code removed. Whatever the bug was, it's likely to have been fixed in the last 14 years. --- ...395-3424-4731-a909-2ce5e029d613.trivial.rst | 0 src/pip/_internal/vcs/git.py | 18 ------------------ 2 files changed, 18 deletions(-) create mode 100644 news/921e9395-3424-4731-a909-2ce5e029d613.trivial.rst diff --git a/news/921e9395-3424-4731-a909-2ce5e029d613.trivial.rst b/news/921e9395-3424-4731-a909-2ce5e029d613.trivial.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index 1769da791cb..a10580eadc1 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -4,8 +4,6 @@ import os.path import pathlib import re -import urllib.parse -import urllib.request from dataclasses import replace from typing import Any @@ -22,9 +20,6 @@ vcs, ) -urlsplit = urllib.parse.urlsplit -urlunsplit = urllib.parse.urlunsplit - logger = logging.getLogger(__name__) @@ -500,19 +495,6 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]: work with a ssh:// scheme (e.g. GitHub). But we need a scheme for parsing. Hence we remove it again afterwards and return it as a stub. """ - # Works around an apparent Git bug - # (see https://article.gmane.org/gmane.comp.version-control.git/146500) - scheme, netloc, path, query, fragment = urlsplit(url) - if scheme.endswith("file"): - initial_slashes = path[: -len(path.lstrip("/"))] - newpath = initial_slashes + urllib.request.url2pathname(path).replace( - "\\", "/" - ).lstrip("/") - after_plus = scheme.find("+") + 1 - url = scheme[:after_plus] + urlunsplit( - (scheme[after_plus:], netloc, newpath, query, fragment), - ) - if "://" not in url: assert "file:" not in url url = url.replace("git+", "git+ssh://") From 722456bbb2f87b0a72f6d4b4ace8cf9cafd2d7d8 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 21 Jul 2025 21:07:41 +0100 Subject: [PATCH 2/6] Try a different approach --- src/pip/_internal/models/link.py | 69 +----------- src/pip/_internal/utils/urls.py | 66 +++++++++++ src/pip/_internal/vcs/git.py | 5 + tests/unit/test_collector.py | 184 ------------------------------- 4 files changed, 74 insertions(+), 250 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 87651c76e25..cf230857681 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -1,7 +1,6 @@ from __future__ import annotations import functools -import itertools import logging import os import posixpath @@ -19,12 +18,11 @@ from pip._internal.utils.filetypes import WHEEL_EXTENSION from pip._internal.utils.hashes import Hashes from pip._internal.utils.misc import ( - pairwise, redact_auth_from_url, split_auth_from_netloc, splitext, ) -from pip._internal.utils.urls import path_to_url, url_to_path +from pip._internal.utils.urls import path_to_url, url_to_path, clean_url if TYPE_CHECKING: from pip._internal.index.collector import IndexContent @@ -113,67 +111,6 @@ def supported_hashes(hashes: dict[str, str] | None) -> dict[str, str] | None: return hashes -def _clean_url_path_part(part: str) -> str: - """ - Clean a "part" of a URL path (i.e. after splitting on "@" characters). - """ - # We unquote prior to quoting to make sure nothing is double quoted. - return urllib.parse.quote(urllib.parse.unquote(part)) - - -def _clean_file_url_path(part: str) -> str: - """ - Clean the first part of a URL path that corresponds to a local - filesystem path (i.e. the first part after splitting on "@" characters). - """ - # We unquote prior to quoting to make sure nothing is double quoted. - # Also, on Windows the path part might contain a drive letter which - # should not be quoted. On Linux where drive letters do not - # exist, the colon should be quoted. We rely on urllib.request - # to do the right thing here. - return urllib.request.pathname2url(urllib.request.url2pathname(part)) - - -# percent-encoded: / -_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE) - - -def _clean_url_path(path: str, is_local_path: bool) -> str: - """ - Clean the path portion of a URL. - """ - if is_local_path: - clean_func = _clean_file_url_path - else: - clean_func = _clean_url_path_part - - # Split on the reserved characters prior to cleaning so that - # revision strings in VCS URLs are properly preserved. - parts = _reserved_chars_re.split(path) - - cleaned_parts = [] - for to_clean, reserved in pairwise(itertools.chain(parts, [""])): - cleaned_parts.append(clean_func(to_clean)) - # Normalize %xx escapes (e.g. %2f -> %2F) - cleaned_parts.append(reserved.upper()) - - return "".join(cleaned_parts) - - -def _ensure_quoted_url(url: str) -> str: - """ - Make sure a link is fully quoted. - For example, if ' ' occurs in the URL, it will be replaced with "%20", - and without double-quoting other characters. - """ - # Split the URL into parts according to the general structure - # `scheme://netloc/path?query#fragment`. - result = urllib.parse.urlsplit(url) - # If the netloc is empty, then the URL refers to a local filesystem path. - is_local_path = not result.netloc - path = _clean_url_path(result.path, is_local_path=is_local_path) - return urllib.parse.urlunsplit(result._replace(path=path)) - def _absolute_link_url(base_url: str, url: str) -> str: """ @@ -281,7 +218,7 @@ def from_json( if file_url is None: return None - url = _ensure_quoted_url(_absolute_link_url(page_url, file_url)) + url = clean_url(_absolute_link_url(page_url, file_url)) pyrequire = file_data.get("requires-python") yanked_reason = file_data.get("yanked") hashes = file_data.get("hashes", {}) @@ -333,7 +270,7 @@ def from_element( if not href: return None - url = _ensure_quoted_url(_absolute_link_url(base_url, href)) + url = clean_url(_absolute_link_url(base_url, href)) pyrequire = anchor_attribs.get("data-requires-python") yanked_reason = anchor_attribs.get("data-yanked") diff --git a/src/pip/_internal/utils/urls.py b/src/pip/_internal/utils/urls.py index 9f34f882a1a..8750ec92261 100644 --- a/src/pip/_internal/utils/urls.py +++ b/src/pip/_internal/utils/urls.py @@ -1,9 +1,12 @@ +import itertools import os import string +import re import urllib.parse import urllib.request from .compat import WINDOWS +from pip._internal.utils.misc import pairwise def path_to_url(path: str) -> str: @@ -53,3 +56,66 @@ def url_to_path(url: str) -> str: path = path[1:] return path + + +def _clean_url_path_part(part: str) -> str: + """ + Clean a "part" of a URL path (i.e. after splitting on "@" characters). + """ + # We unquote prior to quoting to make sure nothing is double quoted. + return urllib.parse.quote(urllib.parse.unquote(part)) + + +def _clean_file_url_path(part: str) -> str: + """ + Clean the first part of a URL path that corresponds to a local + filesystem path (i.e. the first part after splitting on "@" characters). + """ + # We unquote prior to quoting to make sure nothing is double quoted. + # Also, on Windows the path part might contain a drive letter which + # should not be quoted. On Linux where drive letters do not + # exist, the colon should be quoted. We rely on urllib.request + # to do the right thing here. + return urllib.request.pathname2url(urllib.request.url2pathname(part)) + + +# percent-encoded: / +_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE) + + +def _clean_url_path(path: str, is_local_path: bool) -> str: + """ + Clean the path portion of a URL. + """ + if is_local_path: + clean_func = _clean_file_url_path + else: + clean_func = _clean_url_path_part + + # Split on the reserved characters prior to cleaning so that + # revision strings in VCS URLs are properly preserved. + parts = _reserved_chars_re.split(path) + + cleaned_parts = [] + for to_clean, reserved in pairwise(itertools.chain(parts, [""])): + cleaned_parts.append(clean_func(to_clean)) + # Normalize %xx escapes (e.g. %2f -> %2F) + cleaned_parts.append(reserved.upper()) + + return "".join(cleaned_parts) + + +def clean_url(url: str) -> str: + """ + Make sure a link is fully quoted. + For example, if ' ' occurs in the URL, it will be replaced with "%20", + and without double-quoting other characters. + """ + # Split the URL into parts according to the general structure + # `scheme://netloc/path?query#fragment`. + result = urllib.parse.urlsplit(url) + # If the netloc is empty, then the URL refers to a local filesystem path. + is_local_path = not result.netloc + path = _clean_url_path(result.path, is_local_path=is_local_path) + return urllib.parse.urlunsplit(result._replace(path=path)) + diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index a10580eadc1..7890ec1c381 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -10,6 +10,7 @@ from pip._internal.exceptions import BadCommand, InstallationError from pip._internal.utils.misc import HiddenText, display_path, hide_url from pip._internal.utils.subprocess import make_command +from pip._internal.utils.urls import clean_url from pip._internal.vcs.versioncontrol import ( AuthInfo, RemoteNotFoundError, @@ -495,6 +496,10 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]: work with a ssh:// scheme (e.g. GitHub). But we need a scheme for parsing. Hence we remove it again afterwards and return it as a stub. """ + # Works around an apparent Git bug + # (see https://article.gmane.org/gmane.comp.version-control.git/146500) + url = clean_url(url) + if "://" not in url: assert "file:" not in url url = url.replace("git+", "git+ssh://") diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index d95fa10fea3..24833407b39 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -33,8 +33,6 @@ Link, LinkHash, MetadataFile, - _clean_url_path, - _ensure_quoted_url, ) from pip._internal.network.session import PipSession @@ -263,188 +261,6 @@ def test_get_simple_response_dont_log_clear_text_password( ] -@pytest.mark.parametrize( - "path, expected", - [ - # Test a character that needs quoting. - ("a b", "a%20b"), - # Test an unquoted "@". - ("a @ b", "a%20@%20b"), - # Test multiple unquoted "@". - ("a @ @ b", "a%20@%20@%20b"), - # Test a quoted "@". - ("a %40 b", "a%20%40%20b"), - # Test a quoted "@" before an unquoted "@". - ("a %40b@ c", "a%20%40b@%20c"), - # Test a quoted "@" after an unquoted "@". - ("a @b%40 c", "a%20@b%40%20c"), - # Test alternating quoted and unquoted "@". - ("a %40@b %40@c %40", "a%20%40@b%20%40@c%20%40"), - # Test an unquoted "/". - ("a / b", "a%20/%20b"), - # Test multiple unquoted "/". - ("a / / b", "a%20/%20/%20b"), - # Test a quoted "/". - ("a %2F b", "a%20%2F%20b"), - # Test a quoted "/" before an unquoted "/". - ("a %2Fb/ c", "a%20%2Fb/%20c"), - # Test a quoted "/" after an unquoted "/". - ("a /b%2F c", "a%20/b%2F%20c"), - # Test alternating quoted and unquoted "/". - ("a %2F/b %2F/c %2F", "a%20%2F/b%20%2F/c%20%2F"), - # Test normalizing non-reserved quoted characters "[" and "]" - ("a %5b %5d b", "a%20%5B%20%5D%20b"), - # Test normalizing a reserved quoted "/" - ("a %2f b", "a%20%2F%20b"), - ], -) -@pytest.mark.parametrize("is_local_path", [True, False]) -def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: - assert _clean_url_path(path, is_local_path=is_local_path) == expected - - -@pytest.mark.parametrize( - "path, expected", - [ - # Test a VCS path with a Windows drive letter and revision. - pytest.param( - "/T:/with space/repo.git@1.0", - "///T:/with%20space/repo.git@1.0", - marks=pytest.mark.skipif("sys.platform != 'win32'"), - ), - # Test a VCS path with a Windows drive letter and revision, - # running on non-windows platform. - pytest.param( - "/T:/with space/repo.git@1.0", - "/T%3A/with%20space/repo.git@1.0", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - ], -) -def test_clean_url_path_with_local_path(path: str, expected: str) -> None: - actual = _clean_url_path(path, is_local_path=True) - assert actual == expected - - -@pytest.mark.parametrize( - "url, clean_url", - [ - # URL with hostname and port. Port separator should not be quoted. - ( - "https://localhost.localdomain:8181/path/with space/", - "https://localhost.localdomain:8181/path/with%20space/", - ), - # URL that is already properly quoted. The quoting `%` - # characters should not be quoted again. - ( - "https://localhost.localdomain:8181/path/with%20quoted%20space/", - "https://localhost.localdomain:8181/path/with%20quoted%20space/", - ), - # URL with IPv4 address and port. - ( - "https://127.0.0.1:8181/path/with space/", - "https://127.0.0.1:8181/path/with%20space/", - ), - # URL with IPv6 address and port. The `[]` brackets around the - # IPv6 address should not be quoted. - ( - "https://[fd00:0:0:236::100]:8181/path/with space/", - "https://[fd00:0:0:236::100]:8181/path/with%20space/", - ), - # URL with query. The leading `?` should not be quoted. - ( - "https://localhost.localdomain:8181/path/with/query?request=test", - "https://localhost.localdomain:8181/path/with/query?request=test", - ), - # URL with colon in the path portion. - ( - "https://localhost.localdomain:8181/path:/with:/colon", - "https://localhost.localdomain:8181/path%3A/with%3A/colon", - ), - # URL with something that looks like a drive letter, but is - # not. The `:` should be quoted. - ( - "https://localhost.localdomain/T:/path/", - "https://localhost.localdomain/T%3A/path/", - ), - # URL with a quoted "/" in the path portion. - ( - "https://example.com/access%2Ftoken/path/", - "https://example.com/access%2Ftoken/path/", - ), - # VCS URL containing revision string. - ( - "git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0", - "git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0", - ), - # VCS URL with a quoted "#" in the revision string. - ( - "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", - "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", - ), - # VCS URL with a quoted "@" in the revision string. - ( - "git+https://example.com/repo.git@at%40 space#egg=my-package-1.0", - "git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0", - ), - # URL with Windows drive letter. The `:` after the drive - # letter should not be quoted. The trailing `/` should be - # removed. - pytest.param( - "file:///T:/path/with spaces/", - "file:///T:/path/with%20spaces", - marks=[ - skip_needs_old_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - ], - ), - pytest.param( - "file:///T:/path/with spaces/", - "file://///T:/path/with%20spaces", - marks=[ - skip_needs_new_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - ], - ), - pytest.param( - "file:///T:/path/with spaces/", - "file://///T:/path/with%20spaces/", - marks=[ - skip_needs_new_urlun_behavior_win, - skip_needs_new_pathname2url_trailing_slash_behavior_win, - ], - ), - # URL with Windows drive letter, running on non-windows - # platform. The `:` after the drive should be quoted. - pytest.param( - "file:///T:/path/with spaces/", - "file:///T%3A/path/with%20spaces/", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - # Test a VCS URL with a Windows drive letter and revision. - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=skip_needs_old_urlun_behavior_win, - ), - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file://///T:/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=skip_needs_new_urlun_behavior_win, - ), - # Test a VCS URL with a Windows drive letter and revision, - # running on non-windows platform. - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - ], -) -def test_ensure_quoted_url(url: str, clean_url: str) -> None: - assert _ensure_quoted_url(url) == clean_url - - def _test_parse_links_data_attribute( anchor_html: str, attr: str, expected: str | None ) -> Link: From 417c09b200c7557226ea48b3d09eff8ff30812c6 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 23 Jul 2025 17:56:17 +0100 Subject: [PATCH 3/6] Preserve `git+` prefix --- src/pip/_internal/vcs/git.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index 7890ec1c381..323c6e41b24 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -498,7 +498,8 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]: """ # Works around an apparent Git bug # (see https://article.gmane.org/gmane.comp.version-control.git/146500) - url = clean_url(url) + prefix, plus, url = url.partition("+") + url = prefix + plus + clean_url(url) if "://" not in url: assert "file:" not in url From 959c47373f07781361fb9b71b07c058d8426d99d Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 23 Jul 2025 18:14:38 +0100 Subject: [PATCH 4/6] Handle stub URLs --- src/pip/_internal/vcs/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index 323c6e41b24..afe7a220e14 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -498,8 +498,8 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]: """ # Works around an apparent Git bug # (see https://article.gmane.org/gmane.comp.version-control.git/146500) - prefix, plus, url = url.partition("+") - url = prefix + plus + clean_url(url) + if url.startswith("git+file:"): + url = "git+" + clean_url(url[4:]) if "://" not in url: assert "file:" not in url From 89af2dad5cd25745a2c5cd5747081482c7b79982 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 23 Jul 2025 18:19:07 +0100 Subject: [PATCH 5/6] Restore deleted tests --- tests/unit/test_collector.py | 9 +- tests/unit/test_urls.py | 196 ++++++++++++++++++++++++++++++++++- 2 files changed, 196 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 24833407b39..527b589c45c 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -36,14 +36,7 @@ ) from pip._internal.network.session import PipSession -from tests.lib import ( - TestData, - make_test_link_collector, - skip_needs_new_pathname2url_trailing_slash_behavior_win, - skip_needs_new_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - skip_needs_old_urlun_behavior_win, -) +from tests.lib import TestData, make_test_link_collector ACCEPT = ", ".join( [ diff --git a/tests/unit/test_urls.py b/tests/unit/test_urls.py index 0c145255080..1caad64f1e9 100644 --- a/tests/unit/test_urls.py +++ b/tests/unit/test_urls.py @@ -4,7 +4,19 @@ import pytest -from pip._internal.utils.urls import path_to_url, url_to_path +from pip._internal.utils.urls import ( + path_to_url, + url_to_path, + clean_url, + _clean_url_path, +) + +from tests.lib import ( + skip_needs_new_pathname2url_trailing_slash_behavior_win, + skip_needs_new_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + skip_needs_old_urlun_behavior_win, +) @pytest.mark.skipif("sys.platform == 'win32'") @@ -75,3 +87,185 @@ def test_url_to_path_path_to_url_symmetry_win() -> None: unc_path = r"\\unc\share\path" assert url_to_path(path_to_url(unc_path)) == unc_path + +@pytest.mark.parametrize( + "path, expected", + [ + # Test a character that needs quoting. + ("a b", "a%20b"), + # Test an unquoted "@". + ("a @ b", "a%20@%20b"), + # Test multiple unquoted "@". + ("a @ @ b", "a%20@%20@%20b"), + # Test a quoted "@". + ("a %40 b", "a%20%40%20b"), + # Test a quoted "@" before an unquoted "@". + ("a %40b@ c", "a%20%40b@%20c"), + # Test a quoted "@" after an unquoted "@". + ("a @b%40 c", "a%20@b%40%20c"), + # Test alternating quoted and unquoted "@". + ("a %40@b %40@c %40", "a%20%40@b%20%40@c%20%40"), + # Test an unquoted "/". + ("a / b", "a%20/%20b"), + # Test multiple unquoted "/". + ("a / / b", "a%20/%20/%20b"), + # Test a quoted "/". + ("a %2F b", "a%20%2F%20b"), + # Test a quoted "/" before an unquoted "/". + ("a %2Fb/ c", "a%20%2Fb/%20c"), + # Test a quoted "/" after an unquoted "/". + ("a /b%2F c", "a%20/b%2F%20c"), + # Test alternating quoted and unquoted "/". + ("a %2F/b %2F/c %2F", "a%20%2F/b%20%2F/c%20%2F"), + # Test normalizing non-reserved quoted characters "[" and "]" + ("a %5b %5d b", "a%20%5B%20%5D%20b"), + # Test normalizing a reserved quoted "/" + ("a %2f b", "a%20%2F%20b"), + ], +) +@pytest.mark.parametrize("is_local_path", [True, False]) +def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: + assert _clean_url_path(path, is_local_path=is_local_path) == expected + + +@pytest.mark.parametrize( + "path, expected", + [ + # Test a VCS path with a Windows drive letter and revision. + pytest.param( + "/T:/with space/repo.git@1.0", + "///T:/with%20space/repo.git@1.0", + marks=pytest.mark.skipif("sys.platform != 'win32'"), + ), + # Test a VCS path with a Windows drive letter and revision, + # running on non-windows platform. + pytest.param( + "/T:/with space/repo.git@1.0", + "/T%3A/with%20space/repo.git@1.0", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + ], +) +def test_clean_url_path_with_local_path(path: str, expected: str) -> None: + actual = _clean_url_path(path, is_local_path=True) + assert actual == expected + + +@pytest.mark.parametrize( + "url, expected_url", + [ + # URL with hostname and port. Port separator should not be quoted. + ( + "https://localhost.localdomain:8181/path/with space/", + "https://localhost.localdomain:8181/path/with%20space/", + ), + # URL that is already properly quoted. The quoting `%` + # characters should not be quoted again. + ( + "https://localhost.localdomain:8181/path/with%20quoted%20space/", + "https://localhost.localdomain:8181/path/with%20quoted%20space/", + ), + # URL with IPv4 address and port. + ( + "https://127.0.0.1:8181/path/with space/", + "https://127.0.0.1:8181/path/with%20space/", + ), + # URL with IPv6 address and port. The `[]` brackets around the + # IPv6 address should not be quoted. + ( + "https://[fd00:0:0:236::100]:8181/path/with space/", + "https://[fd00:0:0:236::100]:8181/path/with%20space/", + ), + # URL with query. The leading `?` should not be quoted. + ( + "https://localhost.localdomain:8181/path/with/query?request=test", + "https://localhost.localdomain:8181/path/with/query?request=test", + ), + # URL with colon in the path portion. + ( + "https://localhost.localdomain:8181/path:/with:/colon", + "https://localhost.localdomain:8181/path%3A/with%3A/colon", + ), + # URL with something that looks like a drive letter, but is + # not. The `:` should be quoted. + ( + "https://localhost.localdomain/T:/path/", + "https://localhost.localdomain/T%3A/path/", + ), + # URL with a quoted "/" in the path portion. + ( + "https://example.com/access%2Ftoken/path/", + "https://example.com/access%2Ftoken/path/", + ), + # VCS URL containing revision string. + ( + "git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0", + "git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0", + ), + # VCS URL with a quoted "#" in the revision string. + ( + "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", + "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", + ), + # VCS URL with a quoted "@" in the revision string. + ( + "git+https://example.com/repo.git@at%40 space#egg=my-package-1.0", + "git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0", + ), + # URL with Windows drive letter. The `:` after the drive + # letter should not be quoted. The trailing `/` should be + # removed. + pytest.param( + "file:///T:/path/with spaces/", + "file:///T:/path/with%20spaces", + marks=[ + skip_needs_old_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + ], + ), + pytest.param( + "file:///T:/path/with spaces/", + "file://///T:/path/with%20spaces", + marks=[ + skip_needs_new_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + ], + ), + pytest.param( + "file:///T:/path/with spaces/", + "file://///T:/path/with%20spaces/", + marks=[ + skip_needs_new_urlun_behavior_win, + skip_needs_new_pathname2url_trailing_slash_behavior_win, + ], + ), + # URL with Windows drive letter, running on non-windows + # platform. The `:` after the drive should be quoted. + pytest.param( + "file:///T:/path/with spaces/", + "file:///T%3A/path/with%20spaces/", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + # Test a VCS URL with a Windows drive letter and revision. + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=skip_needs_old_urlun_behavior_win, + ), + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file://///T:/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=skip_needs_new_urlun_behavior_win, + ), + # Test a VCS URL with a Windows drive letter and revision, + # running on non-windows platform. + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + ], +) +def test_clean_url(url: str, expected_url: str) -> None: + assert clean_url(url) == expected_url + From 72ef7d4c574062c67b0557b812fd570b9378c05c Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 23 Jul 2025 18:36:44 +0100 Subject: [PATCH 6/6] Fix lint --- src/pip/_internal/models/link.py | 3 +-- src/pip/_internal/utils/urls.py | 6 +++--- src/pip/_internal/vcs/git.py | 1 - tests/unit/test_urls.py | 6 +++--- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index cf230857681..d17c48e04e8 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -22,7 +22,7 @@ split_auth_from_netloc, splitext, ) -from pip._internal.utils.urls import path_to_url, url_to_path, clean_url +from pip._internal.utils.urls import clean_url, path_to_url, url_to_path if TYPE_CHECKING: from pip._internal.index.collector import IndexContent @@ -111,7 +111,6 @@ def supported_hashes(hashes: dict[str, str] | None) -> dict[str, str] | None: return hashes - def _absolute_link_url(base_url: str, url: str) -> str: """ A faster implementation of urllib.parse.urljoin with a shortcut diff --git a/src/pip/_internal/utils/urls.py b/src/pip/_internal/utils/urls.py index 8750ec92261..66ea445718f 100644 --- a/src/pip/_internal/utils/urls.py +++ b/src/pip/_internal/utils/urls.py @@ -1,13 +1,14 @@ import itertools import os -import string import re +import string import urllib.parse import urllib.request -from .compat import WINDOWS from pip._internal.utils.misc import pairwise +from .compat import WINDOWS + def path_to_url(path: str) -> str: """ @@ -118,4 +119,3 @@ def clean_url(url: str) -> str: is_local_path = not result.netloc path = _clean_url_path(result.path, is_local_path=is_local_path) return urllib.parse.urlunsplit(result._replace(path=path)) - diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index afe7a220e14..8d2eeb4fb09 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -21,7 +21,6 @@ vcs, ) - logger = logging.getLogger(__name__) diff --git a/tests/unit/test_urls.py b/tests/unit/test_urls.py index 1caad64f1e9..7e02b09e1fb 100644 --- a/tests/unit/test_urls.py +++ b/tests/unit/test_urls.py @@ -5,10 +5,10 @@ import pytest from pip._internal.utils.urls import ( + _clean_url_path, + clean_url, path_to_url, url_to_path, - clean_url, - _clean_url_path, ) from tests.lib import ( @@ -88,6 +88,7 @@ def test_url_to_path_path_to_url_symmetry_win() -> None: unc_path = r"\\unc\share\path" assert url_to_path(path_to_url(unc_path)) == unc_path + @pytest.mark.parametrize( "path, expected", [ @@ -268,4 +269,3 @@ def test_clean_url_path_with_local_path(path: str, expected: str) -> None: ) def test_clean_url(url: str, expected_url: str) -> None: assert clean_url(url) == expected_url -