diff --git a/news/921e9395-3424-4731-a909-2ce5e029d613.trivial.rst b/news/921e9395-3424-4731-a909-2ce5e029d613.trivial.rst new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 87651c76e25..d17c48e04e8 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -1,7 +1,6 @@ from __future__ import annotations import functools -import itertools import logging import os import posixpath @@ -19,12 +18,11 @@ from pip._internal.utils.filetypes import WHEEL_EXTENSION from pip._internal.utils.hashes import Hashes from pip._internal.utils.misc import ( - pairwise, redact_auth_from_url, split_auth_from_netloc, splitext, ) -from pip._internal.utils.urls import path_to_url, url_to_path +from pip._internal.utils.urls import clean_url, path_to_url, url_to_path if TYPE_CHECKING: from pip._internal.index.collector import IndexContent @@ -113,68 +111,6 @@ def supported_hashes(hashes: dict[str, str] | None) -> dict[str, str] | None: return hashes -def _clean_url_path_part(part: str) -> str: - """ - Clean a "part" of a URL path (i.e. after splitting on "@" characters). - """ - # We unquote prior to quoting to make sure nothing is double quoted. - return urllib.parse.quote(urllib.parse.unquote(part)) - - -def _clean_file_url_path(part: str) -> str: - """ - Clean the first part of a URL path that corresponds to a local - filesystem path (i.e. the first part after splitting on "@" characters). - """ - # We unquote prior to quoting to make sure nothing is double quoted. - # Also, on Windows the path part might contain a drive letter which - # should not be quoted. On Linux where drive letters do not - # exist, the colon should be quoted. We rely on urllib.request - # to do the right thing here. - return urllib.request.pathname2url(urllib.request.url2pathname(part)) - - -# percent-encoded: / -_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE) - - -def _clean_url_path(path: str, is_local_path: bool) -> str: - """ - Clean the path portion of a URL. - """ - if is_local_path: - clean_func = _clean_file_url_path - else: - clean_func = _clean_url_path_part - - # Split on the reserved characters prior to cleaning so that - # revision strings in VCS URLs are properly preserved. - parts = _reserved_chars_re.split(path) - - cleaned_parts = [] - for to_clean, reserved in pairwise(itertools.chain(parts, [""])): - cleaned_parts.append(clean_func(to_clean)) - # Normalize %xx escapes (e.g. %2f -> %2F) - cleaned_parts.append(reserved.upper()) - - return "".join(cleaned_parts) - - -def _ensure_quoted_url(url: str) -> str: - """ - Make sure a link is fully quoted. - For example, if ' ' occurs in the URL, it will be replaced with "%20", - and without double-quoting other characters. - """ - # Split the URL into parts according to the general structure - # `scheme://netloc/path?query#fragment`. - result = urllib.parse.urlsplit(url) - # If the netloc is empty, then the URL refers to a local filesystem path. - is_local_path = not result.netloc - path = _clean_url_path(result.path, is_local_path=is_local_path) - return urllib.parse.urlunsplit(result._replace(path=path)) - - def _absolute_link_url(base_url: str, url: str) -> str: """ A faster implementation of urllib.parse.urljoin with a shortcut @@ -281,7 +217,7 @@ def from_json( if file_url is None: return None - url = _ensure_quoted_url(_absolute_link_url(page_url, file_url)) + url = clean_url(_absolute_link_url(page_url, file_url)) pyrequire = file_data.get("requires-python") yanked_reason = file_data.get("yanked") hashes = file_data.get("hashes", {}) @@ -333,7 +269,7 @@ def from_element( if not href: return None - url = _ensure_quoted_url(_absolute_link_url(base_url, href)) + url = clean_url(_absolute_link_url(base_url, href)) pyrequire = anchor_attribs.get("data-requires-python") yanked_reason = anchor_attribs.get("data-yanked") diff --git a/src/pip/_internal/utils/urls.py b/src/pip/_internal/utils/urls.py index 9f34f882a1a..66ea445718f 100644 --- a/src/pip/_internal/utils/urls.py +++ b/src/pip/_internal/utils/urls.py @@ -1,8 +1,12 @@ +import itertools import os +import re import string import urllib.parse import urllib.request +from pip._internal.utils.misc import pairwise + from .compat import WINDOWS @@ -53,3 +57,65 @@ def url_to_path(url: str) -> str: path = path[1:] return path + + +def _clean_url_path_part(part: str) -> str: + """ + Clean a "part" of a URL path (i.e. after splitting on "@" characters). + """ + # We unquote prior to quoting to make sure nothing is double quoted. + return urllib.parse.quote(urllib.parse.unquote(part)) + + +def _clean_file_url_path(part: str) -> str: + """ + Clean the first part of a URL path that corresponds to a local + filesystem path (i.e. the first part after splitting on "@" characters). + """ + # We unquote prior to quoting to make sure nothing is double quoted. + # Also, on Windows the path part might contain a drive letter which + # should not be quoted. On Linux where drive letters do not + # exist, the colon should be quoted. We rely on urllib.request + # to do the right thing here. + return urllib.request.pathname2url(urllib.request.url2pathname(part)) + + +# percent-encoded: / +_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE) + + +def _clean_url_path(path: str, is_local_path: bool) -> str: + """ + Clean the path portion of a URL. + """ + if is_local_path: + clean_func = _clean_file_url_path + else: + clean_func = _clean_url_path_part + + # Split on the reserved characters prior to cleaning so that + # revision strings in VCS URLs are properly preserved. + parts = _reserved_chars_re.split(path) + + cleaned_parts = [] + for to_clean, reserved in pairwise(itertools.chain(parts, [""])): + cleaned_parts.append(clean_func(to_clean)) + # Normalize %xx escapes (e.g. %2f -> %2F) + cleaned_parts.append(reserved.upper()) + + return "".join(cleaned_parts) + + +def clean_url(url: str) -> str: + """ + Make sure a link is fully quoted. + For example, if ' ' occurs in the URL, it will be replaced with "%20", + and without double-quoting other characters. + """ + # Split the URL into parts according to the general structure + # `scheme://netloc/path?query#fragment`. + result = urllib.parse.urlsplit(url) + # If the netloc is empty, then the URL refers to a local filesystem path. + is_local_path = not result.netloc + path = _clean_url_path(result.path, is_local_path=is_local_path) + return urllib.parse.urlunsplit(result._replace(path=path)) diff --git a/src/pip/_internal/vcs/git.py b/src/pip/_internal/vcs/git.py index 1769da791cb..8d2eeb4fb09 100644 --- a/src/pip/_internal/vcs/git.py +++ b/src/pip/_internal/vcs/git.py @@ -4,14 +4,13 @@ import os.path import pathlib import re -import urllib.parse -import urllib.request from dataclasses import replace from typing import Any from pip._internal.exceptions import BadCommand, InstallationError from pip._internal.utils.misc import HiddenText, display_path, hide_url from pip._internal.utils.subprocess import make_command +from pip._internal.utils.urls import clean_url from pip._internal.vcs.versioncontrol import ( AuthInfo, RemoteNotFoundError, @@ -22,10 +21,6 @@ vcs, ) -urlsplit = urllib.parse.urlsplit -urlunsplit = urllib.parse.urlunsplit - - logger = logging.getLogger(__name__) @@ -502,16 +497,8 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]: """ # Works around an apparent Git bug # (see https://article.gmane.org/gmane.comp.version-control.git/146500) - scheme, netloc, path, query, fragment = urlsplit(url) - if scheme.endswith("file"): - initial_slashes = path[: -len(path.lstrip("/"))] - newpath = initial_slashes + urllib.request.url2pathname(path).replace( - "\\", "/" - ).lstrip("/") - after_plus = scheme.find("+") + 1 - url = scheme[:after_plus] + urlunsplit( - (scheme[after_plus:], netloc, newpath, query, fragment), - ) + if url.startswith("git+file:"): + url = "git+" + clean_url(url[4:]) if "://" not in url: assert "file:" not in url diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index d95fa10fea3..527b589c45c 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -33,19 +33,10 @@ Link, LinkHash, MetadataFile, - _clean_url_path, - _ensure_quoted_url, ) from pip._internal.network.session import PipSession -from tests.lib import ( - TestData, - make_test_link_collector, - skip_needs_new_pathname2url_trailing_slash_behavior_win, - skip_needs_new_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - skip_needs_old_urlun_behavior_win, -) +from tests.lib import TestData, make_test_link_collector ACCEPT = ", ".join( [ @@ -263,188 +254,6 @@ def test_get_simple_response_dont_log_clear_text_password( ] -@pytest.mark.parametrize( - "path, expected", - [ - # Test a character that needs quoting. - ("a b", "a%20b"), - # Test an unquoted "@". - ("a @ b", "a%20@%20b"), - # Test multiple unquoted "@". - ("a @ @ b", "a%20@%20@%20b"), - # Test a quoted "@". - ("a %40 b", "a%20%40%20b"), - # Test a quoted "@" before an unquoted "@". - ("a %40b@ c", "a%20%40b@%20c"), - # Test a quoted "@" after an unquoted "@". - ("a @b%40 c", "a%20@b%40%20c"), - # Test alternating quoted and unquoted "@". - ("a %40@b %40@c %40", "a%20%40@b%20%40@c%20%40"), - # Test an unquoted "/". - ("a / b", "a%20/%20b"), - # Test multiple unquoted "/". - ("a / / b", "a%20/%20/%20b"), - # Test a quoted "/". - ("a %2F b", "a%20%2F%20b"), - # Test a quoted "/" before an unquoted "/". - ("a %2Fb/ c", "a%20%2Fb/%20c"), - # Test a quoted "/" after an unquoted "/". - ("a /b%2F c", "a%20/b%2F%20c"), - # Test alternating quoted and unquoted "/". - ("a %2F/b %2F/c %2F", "a%20%2F/b%20%2F/c%20%2F"), - # Test normalizing non-reserved quoted characters "[" and "]" - ("a %5b %5d b", "a%20%5B%20%5D%20b"), - # Test normalizing a reserved quoted "/" - ("a %2f b", "a%20%2F%20b"), - ], -) -@pytest.mark.parametrize("is_local_path", [True, False]) -def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: - assert _clean_url_path(path, is_local_path=is_local_path) == expected - - -@pytest.mark.parametrize( - "path, expected", - [ - # Test a VCS path with a Windows drive letter and revision. - pytest.param( - "/T:/with space/repo.git@1.0", - "///T:/with%20space/repo.git@1.0", - marks=pytest.mark.skipif("sys.platform != 'win32'"), - ), - # Test a VCS path with a Windows drive letter and revision, - # running on non-windows platform. - pytest.param( - "/T:/with space/repo.git@1.0", - "/T%3A/with%20space/repo.git@1.0", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - ], -) -def test_clean_url_path_with_local_path(path: str, expected: str) -> None: - actual = _clean_url_path(path, is_local_path=True) - assert actual == expected - - -@pytest.mark.parametrize( - "url, clean_url", - [ - # URL with hostname and port. Port separator should not be quoted. - ( - "https://localhost.localdomain:8181/path/with space/", - "https://localhost.localdomain:8181/path/with%20space/", - ), - # URL that is already properly quoted. The quoting `%` - # characters should not be quoted again. - ( - "https://localhost.localdomain:8181/path/with%20quoted%20space/", - "https://localhost.localdomain:8181/path/with%20quoted%20space/", - ), - # URL with IPv4 address and port. - ( - "https://127.0.0.1:8181/path/with space/", - "https://127.0.0.1:8181/path/with%20space/", - ), - # URL with IPv6 address and port. The `[]` brackets around the - # IPv6 address should not be quoted. - ( - "https://[fd00:0:0:236::100]:8181/path/with space/", - "https://[fd00:0:0:236::100]:8181/path/with%20space/", - ), - # URL with query. The leading `?` should not be quoted. - ( - "https://localhost.localdomain:8181/path/with/query?request=test", - "https://localhost.localdomain:8181/path/with/query?request=test", - ), - # URL with colon in the path portion. - ( - "https://localhost.localdomain:8181/path:/with:/colon", - "https://localhost.localdomain:8181/path%3A/with%3A/colon", - ), - # URL with something that looks like a drive letter, but is - # not. The `:` should be quoted. - ( - "https://localhost.localdomain/T:/path/", - "https://localhost.localdomain/T%3A/path/", - ), - # URL with a quoted "/" in the path portion. - ( - "https://example.com/access%2Ftoken/path/", - "https://example.com/access%2Ftoken/path/", - ), - # VCS URL containing revision string. - ( - "git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0", - "git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0", - ), - # VCS URL with a quoted "#" in the revision string. - ( - "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", - "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", - ), - # VCS URL with a quoted "@" in the revision string. - ( - "git+https://example.com/repo.git@at%40 space#egg=my-package-1.0", - "git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0", - ), - # URL with Windows drive letter. The `:` after the drive - # letter should not be quoted. The trailing `/` should be - # removed. - pytest.param( - "file:///T:/path/with spaces/", - "file:///T:/path/with%20spaces", - marks=[ - skip_needs_old_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - ], - ), - pytest.param( - "file:///T:/path/with spaces/", - "file://///T:/path/with%20spaces", - marks=[ - skip_needs_new_urlun_behavior_win, - skip_needs_old_pathname2url_trailing_slash_behavior_win, - ], - ), - pytest.param( - "file:///T:/path/with spaces/", - "file://///T:/path/with%20spaces/", - marks=[ - skip_needs_new_urlun_behavior_win, - skip_needs_new_pathname2url_trailing_slash_behavior_win, - ], - ), - # URL with Windows drive letter, running on non-windows - # platform. The `:` after the drive should be quoted. - pytest.param( - "file:///T:/path/with spaces/", - "file:///T%3A/path/with%20spaces/", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - # Test a VCS URL with a Windows drive letter and revision. - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=skip_needs_old_urlun_behavior_win, - ), - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file://///T:/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=skip_needs_new_urlun_behavior_win, - ), - # Test a VCS URL with a Windows drive letter and revision, - # running on non-windows platform. - pytest.param( - "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", - "git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0", - marks=pytest.mark.skipif("sys.platform == 'win32'"), - ), - ], -) -def test_ensure_quoted_url(url: str, clean_url: str) -> None: - assert _ensure_quoted_url(url) == clean_url - - def _test_parse_links_data_attribute( anchor_html: str, attr: str, expected: str | None ) -> Link: diff --git a/tests/unit/test_urls.py b/tests/unit/test_urls.py index 0c145255080..7e02b09e1fb 100644 --- a/tests/unit/test_urls.py +++ b/tests/unit/test_urls.py @@ -4,7 +4,19 @@ import pytest -from pip._internal.utils.urls import path_to_url, url_to_path +from pip._internal.utils.urls import ( + _clean_url_path, + clean_url, + path_to_url, + url_to_path, +) + +from tests.lib import ( + skip_needs_new_pathname2url_trailing_slash_behavior_win, + skip_needs_new_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + skip_needs_old_urlun_behavior_win, +) @pytest.mark.skipif("sys.platform == 'win32'") @@ -75,3 +87,185 @@ def test_url_to_path_path_to_url_symmetry_win() -> None: unc_path = r"\\unc\share\path" assert url_to_path(path_to_url(unc_path)) == unc_path + + +@pytest.mark.parametrize( + "path, expected", + [ + # Test a character that needs quoting. + ("a b", "a%20b"), + # Test an unquoted "@". + ("a @ b", "a%20@%20b"), + # Test multiple unquoted "@". + ("a @ @ b", "a%20@%20@%20b"), + # Test a quoted "@". + ("a %40 b", "a%20%40%20b"), + # Test a quoted "@" before an unquoted "@". + ("a %40b@ c", "a%20%40b@%20c"), + # Test a quoted "@" after an unquoted "@". + ("a @b%40 c", "a%20@b%40%20c"), + # Test alternating quoted and unquoted "@". + ("a %40@b %40@c %40", "a%20%40@b%20%40@c%20%40"), + # Test an unquoted "/". + ("a / b", "a%20/%20b"), + # Test multiple unquoted "/". + ("a / / b", "a%20/%20/%20b"), + # Test a quoted "/". + ("a %2F b", "a%20%2F%20b"), + # Test a quoted "/" before an unquoted "/". + ("a %2Fb/ c", "a%20%2Fb/%20c"), + # Test a quoted "/" after an unquoted "/". + ("a /b%2F c", "a%20/b%2F%20c"), + # Test alternating quoted and unquoted "/". + ("a %2F/b %2F/c %2F", "a%20%2F/b%20%2F/c%20%2F"), + # Test normalizing non-reserved quoted characters "[" and "]" + ("a %5b %5d b", "a%20%5B%20%5D%20b"), + # Test normalizing a reserved quoted "/" + ("a %2f b", "a%20%2F%20b"), + ], +) +@pytest.mark.parametrize("is_local_path", [True, False]) +def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None: + assert _clean_url_path(path, is_local_path=is_local_path) == expected + + +@pytest.mark.parametrize( + "path, expected", + [ + # Test a VCS path with a Windows drive letter and revision. + pytest.param( + "/T:/with space/repo.git@1.0", + "///T:/with%20space/repo.git@1.0", + marks=pytest.mark.skipif("sys.platform != 'win32'"), + ), + # Test a VCS path with a Windows drive letter and revision, + # running on non-windows platform. + pytest.param( + "/T:/with space/repo.git@1.0", + "/T%3A/with%20space/repo.git@1.0", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + ], +) +def test_clean_url_path_with_local_path(path: str, expected: str) -> None: + actual = _clean_url_path(path, is_local_path=True) + assert actual == expected + + +@pytest.mark.parametrize( + "url, expected_url", + [ + # URL with hostname and port. Port separator should not be quoted. + ( + "https://localhost.localdomain:8181/path/with space/", + "https://localhost.localdomain:8181/path/with%20space/", + ), + # URL that is already properly quoted. The quoting `%` + # characters should not be quoted again. + ( + "https://localhost.localdomain:8181/path/with%20quoted%20space/", + "https://localhost.localdomain:8181/path/with%20quoted%20space/", + ), + # URL with IPv4 address and port. + ( + "https://127.0.0.1:8181/path/with space/", + "https://127.0.0.1:8181/path/with%20space/", + ), + # URL with IPv6 address and port. The `[]` brackets around the + # IPv6 address should not be quoted. + ( + "https://[fd00:0:0:236::100]:8181/path/with space/", + "https://[fd00:0:0:236::100]:8181/path/with%20space/", + ), + # URL with query. The leading `?` should not be quoted. + ( + "https://localhost.localdomain:8181/path/with/query?request=test", + "https://localhost.localdomain:8181/path/with/query?request=test", + ), + # URL with colon in the path portion. + ( + "https://localhost.localdomain:8181/path:/with:/colon", + "https://localhost.localdomain:8181/path%3A/with%3A/colon", + ), + # URL with something that looks like a drive letter, but is + # not. The `:` should be quoted. + ( + "https://localhost.localdomain/T:/path/", + "https://localhost.localdomain/T%3A/path/", + ), + # URL with a quoted "/" in the path portion. + ( + "https://example.com/access%2Ftoken/path/", + "https://example.com/access%2Ftoken/path/", + ), + # VCS URL containing revision string. + ( + "git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0", + "git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0", + ), + # VCS URL with a quoted "#" in the revision string. + ( + "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", + "git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0", + ), + # VCS URL with a quoted "@" in the revision string. + ( + "git+https://example.com/repo.git@at%40 space#egg=my-package-1.0", + "git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0", + ), + # URL with Windows drive letter. The `:` after the drive + # letter should not be quoted. The trailing `/` should be + # removed. + pytest.param( + "file:///T:/path/with spaces/", + "file:///T:/path/with%20spaces", + marks=[ + skip_needs_old_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + ], + ), + pytest.param( + "file:///T:/path/with spaces/", + "file://///T:/path/with%20spaces", + marks=[ + skip_needs_new_urlun_behavior_win, + skip_needs_old_pathname2url_trailing_slash_behavior_win, + ], + ), + pytest.param( + "file:///T:/path/with spaces/", + "file://///T:/path/with%20spaces/", + marks=[ + skip_needs_new_urlun_behavior_win, + skip_needs_new_pathname2url_trailing_slash_behavior_win, + ], + ), + # URL with Windows drive letter, running on non-windows + # platform. The `:` after the drive should be quoted. + pytest.param( + "file:///T:/path/with spaces/", + "file:///T%3A/path/with%20spaces/", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + # Test a VCS URL with a Windows drive letter and revision. + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=skip_needs_old_urlun_behavior_win, + ), + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file://///T:/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=skip_needs_new_urlun_behavior_win, + ), + # Test a VCS URL with a Windows drive letter and revision, + # running on non-windows platform. + pytest.param( + "git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0", + "git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0", + marks=pytest.mark.skipif("sys.platform == 'win32'"), + ), + ], +) +def test_clean_url(url: str, expected_url: str) -> None: + assert clean_url(url) == expected_url