Skip to content

Promote _ensure_quoted_url() to a shared utility function #13498

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
70 changes: 3 additions & 67 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import functools
import itertools
import logging
import os
import posixpath
Expand All @@ -19,12 +18,11 @@
from pip._internal.utils.filetypes import WHEEL_EXTENSION
from pip._internal.utils.hashes import Hashes
from pip._internal.utils.misc import (
pairwise,
redact_auth_from_url,
split_auth_from_netloc,
splitext,
)
from pip._internal.utils.urls import path_to_url, url_to_path
from pip._internal.utils.urls import clean_url, path_to_url, url_to_path

if TYPE_CHECKING:
from pip._internal.index.collector import IndexContent
Expand Down Expand Up @@ -113,68 +111,6 @@ def supported_hashes(hashes: dict[str, str] | None) -> dict[str, str] | None:
return hashes


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))


# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part

# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)

cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())

return "".join(cleaned_parts)


def _ensure_quoted_url(url: str) -> str:
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path?query#fragment`.
result = urllib.parse.urlsplit(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunsplit(result._replace(path=path))


def _absolute_link_url(base_url: str, url: str) -> str:
"""
A faster implementation of urllib.parse.urljoin with a shortcut
Expand Down Expand Up @@ -281,7 +217,7 @@ def from_json(
if file_url is None:
return None

url = _ensure_quoted_url(_absolute_link_url(page_url, file_url))
url = clean_url(_absolute_link_url(page_url, file_url))
pyrequire = file_data.get("requires-python")
yanked_reason = file_data.get("yanked")
hashes = file_data.get("hashes", {})
Expand Down Expand Up @@ -333,7 +269,7 @@ def from_element(
if not href:
return None

url = _ensure_quoted_url(_absolute_link_url(base_url, href))
url = clean_url(_absolute_link_url(base_url, href))
pyrequire = anchor_attribs.get("data-requires-python")
yanked_reason = anchor_attribs.get("data-yanked")

Expand Down
66 changes: 66 additions & 0 deletions src/pip/_internal/utils/urls.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import itertools
import os
import re
import string
import urllib.parse
import urllib.request

from pip._internal.utils.misc import pairwise

from .compat import WINDOWS


Expand Down Expand Up @@ -53,3 +57,65 @@ def url_to_path(url: str) -> str:
path = path[1:]

return path


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib.request.pathname2url(urllib.request.url2pathname(part))


# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part

# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)

cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())

return "".join(cleaned_parts)


def clean_url(url: str) -> str:
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path?query#fragment`.
result = urllib.parse.urlsplit(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib.parse.urlunsplit(result._replace(path=path))
19 changes: 3 additions & 16 deletions src/pip/_internal/vcs/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
import os.path
import pathlib
import re
import urllib.parse
import urllib.request
from dataclasses import replace
from typing import Any

from pip._internal.exceptions import BadCommand, InstallationError
from pip._internal.utils.misc import HiddenText, display_path, hide_url
from pip._internal.utils.subprocess import make_command
from pip._internal.utils.urls import clean_url
from pip._internal.vcs.versioncontrol import (
AuthInfo,
RemoteNotFoundError,
Expand All @@ -22,10 +21,6 @@
vcs,
)

urlsplit = urllib.parse.urlsplit
urlunsplit = urllib.parse.urlunsplit


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -502,16 +497,8 @@ def get_url_rev_and_auth(cls, url: str) -> tuple[str, str | None, AuthInfo]:
"""
# Works around an apparent Git bug
# (see https://article.gmane.org/gmane.comp.version-control.git/146500)
scheme, netloc, path, query, fragment = urlsplit(url)
if scheme.endswith("file"):
initial_slashes = path[: -len(path.lstrip("/"))]
newpath = initial_slashes + urllib.request.url2pathname(path).replace(
"\\", "/"
).lstrip("/")
after_plus = scheme.find("+") + 1
url = scheme[:after_plus] + urlunsplit(
(scheme[after_plus:], netloc, newpath, query, fragment),
)
if url.startswith("git+file:"):
url = "git+" + clean_url(url[4:])

if "://" not in url:
assert "file:" not in url
Expand Down
Loading
Loading