diff --git a/docs/html/user_guide.rst b/docs/html/user_guide.rst index d6a0acf9cd8..3764c14adcd 100644 --- a/docs/html/user_guide.rst +++ b/docs/html/user_guide.rst @@ -46,7 +46,7 @@ directly from distribution files. The most common scenario is to install from `PyPI`_ using :ref:`Requirement -Specifiers` +Specifiers`. .. tab:: Unix/macOS @@ -66,6 +66,16 @@ Specifiers` For more information and examples, see the :ref:`pip install` reference. +.. note:: + + Pip generates a ``User-Agent`` string that includes the running version of Python and pip, along + with further identifying info such as the operating system, CPU architecture, libc and Linux distribution. This string is sent with + every HTTP(S) request pip makes to a remote host. + + Setting ``PIP_TELEMETRY_USER_AGENT_ID`` in the + process environment to any value makes pip send exactly that value as the + ``User-Agent`` header instead. The effective ``User-Agent`` value is printed as part of the output of :doc:`cli/pip_debug`. + .. _PyPI: https://pypi.org/ .. _`0-basic-authentication-credentials`: diff --git a/news/13560.feature.rst b/news/13560.feature.rst new file mode 100644 index 00000000000..f8814be1a24 --- /dev/null +++ b/news/13560.feature.rst @@ -0,0 +1 @@ +Introduces ``PIP_TELEMETRY_USER_AGENT_ID``, an environment variable which completely overrides the value used to form pip's ``User-Agent`` header.
diff --git a/src/pip/_internal/commands/debug.py b/src/pip/_internal/commands/debug.py index 0e187e79c28..ceaa85a0889 100644 --- a/src/pip/_internal/commands/debug.py +++ b/src/pip/_internal/commands/debug.py @@ -18,6 +18,7 @@ from pip._internal.cli.status_codes import SUCCESS from pip._internal.configuration import Configuration from pip._internal.metadata import get_environment +from pip._internal.network.session import Telemetry from pip._internal.utils.compat import open_text_resource from pip._internal.utils.logging import indent_log from pip._internal.utils.misc import get_pip_version @@ -190,6 +191,7 @@ def run(self, options: Values, args: list[str]) -> int: show_value("sys.platform", sys.platform) show_sys_implementation() + show_value("User-Agent header", Telemetry.user_agent_id()) show_value("'cert' config value", ca_bundle_info(self.parser.config)) show_value("REQUESTS_CA_BUNDLE", os.environ.get("REQUESTS_CA_BUNDLE")) show_value("CURL_CA_BUNDLE", os.environ.get("CURL_CA_BUNDLE")) diff --git a/src/pip/_internal/network/session.py b/src/pip/_internal/network/session.py index a1f9444e37b..1bd45627fe5 100644 --- a/src/pip/_internal/network/session.py +++ b/src/pip/_internal/network/session.py @@ -8,20 +8,18 @@ import functools import io import ipaddress -import json import logging import mimetypes import os -import platform -import shutil -import subprocess -import sys +import re import urllib.parse import warnings from collections.abc import Generator, Mapping, Sequence from typing import ( TYPE_CHECKING, Any, + Callable, + ClassVar, Optional, Union, ) @@ -35,20 +33,16 @@ from pip._vendor.urllib3.connectionpool import ConnectionPool from pip._vendor.urllib3.exceptions import InsecureRequestWarning -from pip import __version__ -from pip._internal.metadata import get_default_environment from pip._internal.models.link import Link from pip._internal.network.auth import MultiDomainBasicAuth from pip._internal.network.cache import SafeFileCache - -# Import ssl 
from compat so the initial import occurs in only one place. -from pip._internal.utils.compat import has_tls -from pip._internal.utils.glibc import libc_ver from pip._internal.utils.misc import build_url_from_netloc, parse_netloc from pip._internal.utils.urls import url_to_path if TYPE_CHECKING: + from collections.abc import Set from ssl import SSLContext + from types import ModuleType from pip._vendor.urllib3.poolmanager import PoolManager from pip._vendor.urllib3.proxymanager import ProxyManager @@ -76,136 +70,275 @@ ] -# These are environment variables present when running under various -# CI systems. For each variable, some CI systems that use the variable -# are indicated. The collection was chosen so that for each of a number -# of popular systems, at least one of the environment variables is used. -# This list is used to provide some indication of and lower bound for -# CI traffic to PyPI. Thus, it is okay if the list is not comprehensive. -# For more background, see: https://github.com/pypa/pip/issues/5499 -CI_ENVIRONMENT_VARIABLES = ( - # Azure Pipelines - "BUILD_BUILDID", - # Jenkins - "BUILD_ID", - # AppVeyor, CircleCI, Codeship, Gitlab CI, Shippable, Travis CI - "CI", - # Explicit environment variable. - "PIP_IS_CI", -) +class Telemetry: + """Compute the User-Agent string that pip attaches to its HTTP requests. + + FIXME: This string is currently propagated as a header into every single HTTP + request pip makes. It should really be subject to a formal PEP process + in order to ensure user consent around telemetry is respected. + + The current implementation looks very much like the spec for *environment + markers*: (https://packaging.python.org/en/latest/specifications/dependency-specifiers/#environment-markers). + + Environment markers were developed to provide a rich semantic structure for the + build environment of a dependency.
Dependency resolvers like pip can then + manipulate these logical operators to calculate compatibility relationships, + *without* executing arbitrary code. This idea extends to version strings + themselves as well: the '===' or "arbitrary" operator was + specifically intended as an "escape hatch", for version strings that cannot be + made to conform to standard Python version strings + (https://packaging.python.org/en/latest/specifications/version-specifiers/#arbitrary-equality). + + Environment markers can therefore be viewed as a language for codebases to + communicate their build process and requirements + (https://packaging.python.org/en/latest/flow/#the-packaging-flow) to a generic + resolver. Spack specs + (https://spack.readthedocs.io/en/latest/spec_syntax.html#sec-specs) + are very similar in spirit, and arose precisely to codify the recursive + and conditional relationships across multi-language software stacks. + + Extending this concept, we could consider the *user agent id* as a language for + resolvers to communicate their requirements to external services and + execution environments. For example, clients for the GNU Make Jobserver protocol use + an environment variable to indicate how the jobserver should communicate to them, + and to indicate the maximum bandwidth they can tolerate for parallel execution: + (https://www.gnu.org/software/make/manual/html_node/Job-Slots.html). + + The pants build tool specifically highlights the risk of exposing proprietary + information through thoughtless telemetry (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry). + As a result, not only do they explicitly specify the information being recorded + (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry#what-data-is-sent), + but they additionally incorporate anonymity as an explicit design goal + (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry#how-we-ensure-anonymity). 
+ + Negotiating standards around useful telemetry data for PyPI began here + (https://github.com/pypa/pip/issues/5499), but never became a full PEP. This commit + (https://github.com/pypa/pip/commit/f787788a65cf7a8af8b7ef9dc13c0681d94fff5f) added + the output of an arbitrary subprocess execution into the string pip attaches to + every single HTTP request. + + `PIP_USER_AGENT_USER_DATA` is mentioned in the docs once to *add* identifying info + to the output, in the context of a proxy server + (https://pip.pypa.io/en/stable/user_guide/#using-a-proxy-server). + + We now introduce `PIP_TELEMETRY_USER_AGENT_ID`, which will completely overwrite the + string transmitted to remote hosts that pip communicates with. To maintain backwards + compatibility, it is disabled by default, but can be set to the empty string or any + other value. + """ + @staticmethod + @functools.cache + def user_agent_id() -> str: + return Telemetry.calculate_user_agent_id(os.environ) + + CLOBBER_USER_AGENT_ENV_VAR: ClassVar[str] = "PIP_TELEMETRY_USER_AGENT_ID" + + @staticmethod + def calculate_user_agent_id(env: Mapping[str, str]) -> str: + # If the clobber variable is set, that's the only thing anyone sees. + if Telemetry.CLOBBER_USER_AGENT_ENV_VAR in env: + return env[Telemetry.CLOBBER_USER_AGENT_ENV_VAR] + + # Otherwise, muster all the imports and identifying info necessary to construct + # the legacy unspecified user-agent format. + import json + import platform + import sys + + return Telemetry.calculate_legacy_user_agent_id( + sys, + platform, + json, + Telemetry.pip_version, + Telemetry.linux_distribution, + Telemetry.libc_ver, + Telemetry.openssl_version, + Telemetry.setuptools_version, + Telemetry.rustc_process_execution, + env, + ) -def looks_like_ci() -> bool: - """ - Return whether it looks like pip is running under CI. - """ - # We don't use the method of checking for a tty (e.g. using isatty()) - # because some CI systems mimic a tty (e.g. Travis CI).
Thus that - # method doesn't provide definitive information in either direction. - return any(name in os.environ for name in CI_ENVIRONMENT_VARIABLES) + _rustc_output_regex: ClassVar[re.Pattern[str]] = re.compile(r"^rustc ([^\s]+)") + @staticmethod + def rustc_process_execution() -> str | None: + import shutil + import subprocess -@functools.lru_cache(maxsize=1) -def user_agent() -> str: - """ - Return a string representing the user agent. - """ - data: dict[str, Any] = { - "installer": {"name": "pip", "version": __version__}, - "python": platform.python_version(), - "implementation": { - "name": platform.python_implementation(), - }, - } - - if data["implementation"]["name"] == "CPython": - data["implementation"]["version"] = platform.python_version() - elif data["implementation"]["name"] == "PyPy": - pypy_version_info = sys.pypy_version_info # type: ignore - if pypy_version_info.releaselevel == "final": - pypy_version_info = pypy_version_info[:3] - data["implementation"]["version"] = ".".join( - [str(x) for x in pypy_version_info] - ) - elif data["implementation"]["name"] == "Jython": - # Complete Guess - data["implementation"]["version"] = platform.python_version() - elif data["implementation"]["name"] == "IronPython": - # Complete Guess - data["implementation"]["version"] = platform.python_version() - - if sys.platform.startswith("linux"): + if rustc := shutil.which("rustc"): + try: + rustc_output = subprocess.check_output( + [rustc, "--version"], + stderr=subprocess.STDOUT, + timeout=0.1, + encoding="utf-8", + ) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + return None + else: + if m := Telemetry._rustc_output_regex.match(rustc_output): + return m.group(1) + return None + return None + + @staticmethod + def libc_ver() -> tuple[str, str]: + from pip._internal.utils.glibc import libc_ver + + return libc_ver() + + @staticmethod + def openssl_version() -> str | None: + from pip._internal.utils.compat import has_tls + + if not has_tls(): + 
return None + import _ssl as ssl + + return ssl.OPENSSL_VERSION + + @staticmethod + def setuptools_version() -> str | None: + from pip._internal.metadata import get_default_environment + + setuptools_dist = get_default_environment().get_distribution("setuptools") + if setuptools_dist is None: + return None + return str(setuptools_dist.version) + + @staticmethod + def linux_distribution() -> tuple[str, str, str]: from pip._vendor import distro - linux_distribution = distro.name(), distro.version(), distro.codename() - distro_infos: dict[str, Any] = dict( - filter( - lambda x: x[1], - zip(["name", "version", "id"], linux_distribution), + return distro.name(), distro.version(), distro.codename() + + @staticmethod + def pip_version() -> str: + from pip import __version__ + + return __version__ + + @staticmethod + def calculate_legacy_user_agent_id( + sys: ModuleType, + platform: ModuleType, + json: ModuleType, + pip_version: Callable[[], str], + linux_distribution: Callable[[], tuple[str, str, str]], + libc_ver: Callable[[], tuple[str, str]], + openssl_version: Callable[[], str | None], + setuptools_version: Callable[[], str | None], + rustc_version: Callable[[], str | None], + env: Mapping[str, str], + ) -> str: + data: dict[str, Any] = { + "installer": {"name": "pip", "version": pip_version()}, + "python": platform.python_version(), + "implementation": { + "name": platform.python_implementation(), + }, + } + + if data["implementation"]["name"] == "CPython": + data["implementation"]["version"] = platform.python_version() + elif data["implementation"]["name"] == "PyPy": + pypy_version_info = sys.pypy_version_info + if pypy_version_info.releaselevel == "final": + pypy_version_info = pypy_version_info[:3] + data["implementation"]["version"] = ".".join( + [str(x) for x in pypy_version_info] ) - ) - libc = dict( - filter( - lambda x: x[1], - zip(["lib", "version"], libc_ver()), + elif data["implementation"]["name"] == "Jython": + # Complete Guess + 
data["implementation"]["version"] = platform.python_version() + elif data["implementation"]["name"] == "IronPython": + # Complete Guess + data["implementation"]["version"] = platform.python_version() + + if sys.platform.startswith("linux"): + distro_infos: dict[str, Any] = dict( + filter( + lambda x: x[1], + zip(["name", "version", "id"], linux_distribution()), + ) ) - ) - if libc: - distro_infos["libc"] = libc - if distro_infos: - data["distro"] = distro_infos + libc = dict( + filter( + lambda x: x[1], + zip(["lib", "version"], libc_ver()), + ) + ) + if libc: + distro_infos["libc"] = libc + if distro_infos: + data["distro"] = distro_infos - if sys.platform.startswith("darwin") and platform.mac_ver()[0]: - data["distro"] = {"name": "macOS", "version": platform.mac_ver()[0]} + if sys.platform.startswith("darwin") and platform.mac_ver()[0]: + data["distro"] = {"name": "macOS", "version": platform.mac_ver()[0]} - if platform.system(): - data.setdefault("system", {})["name"] = platform.system() + if platform.system(): + data.setdefault("system", {})["name"] = platform.system() - if platform.release(): - data.setdefault("system", {})["release"] = platform.release() + if platform.release(): + data.setdefault("system", {})["release"] = platform.release() - if platform.machine(): - data["cpu"] = platform.machine() + if platform.machine(): + data["cpu"] = platform.machine() - if has_tls(): - import _ssl as ssl + if (ssl_ver := openssl_version()) is not None: + data["openssl_version"] = ssl_ver - data["openssl_version"] = ssl.OPENSSL_VERSION + if (setuptools_ver := setuptools_version()) is not None: + data["setuptools_version"] = setuptools_ver - setuptools_dist = get_default_environment().get_distribution("setuptools") - if setuptools_dist is not None: - data["setuptools_version"] = str(setuptools_dist.version) + if (rustc_ver := rustc_version()) is not None: + data["rustc_version"] = rustc_ver - if shutil.which("rustc") is not None: - # If for any reason `rustc --version` 
fails, silently ignore it - try: - rustc_output = subprocess.check_output( - ["rustc", "--version"], stderr=subprocess.STDOUT, timeout=0.5 - ) - except Exception: - pass - else: - if rustc_output.startswith(b"rustc "): - # The format of `rustc --version` is: - # `b'rustc 1.52.1 (9bc8c42bb 2021-05-09)\n'` - # We extract just the middle (1.52.1) part - data["rustc_version"] = rustc_output.split(b" ")[1].decode() - - # Use None rather than False so as not to give the impression that - # pip knows it is not being run under CI. Rather, it is a null or - # inconclusive result. Also, we include some value rather than no - # value to make it easier to know that the check has been run. - data["ci"] = True if looks_like_ci() else None - - user_data = os.environ.get("PIP_USER_AGENT_USER_DATA") - if user_data is not None: - data["user_data"] = user_data - - return "{data[installer][name]}/{data[installer][version]} {json}".format( - data=data, - json=json.dumps(data, separators=(",", ":"), sort_keys=True), + # Use None rather than False so as not to give the impression that + # pip knows it is not being run under CI. Rather, it is a null or + # inconclusive result. Also, we include some value rather than no + # value to make it easier to know that the check has been run. + data["ci"] = ( + True if Telemetry.has_known_ci_sentinel(frozenset(env.keys())) else None + ) + + if (user_data := env.get("PIP_USER_AGENT_USER_DATA")) is not None: + data["user_data"] = user_data + + return "{data[installer][name]}/{data[installer][version]} {json}".format( + data=data, + json=json.dumps(data, separators=(",", ":"), sort_keys=True), + ) + + # These are environment variables present when running under various + # CI systems. For each variable, some CI systems that use the variable + # are indicated. The collection was chosen so that for each of a number + # of popular systems, at least one of the environment variables is used. 
+ # This list is used to provide some indication of and lower bound for + # CI traffic to PyPI. Thus, it is okay if the list is not comprehensive. + # For more background, see: https://github.com/pypa/pip/issues/5499 + KNOWN_CI_SENTINEL_VARIABLES: ClassVar[tuple[str, ...]] = ( + # Azure Pipelines + "BUILD_BUILDID", + # Jenkins + "BUILD_ID", + # AppVeyor, CircleCI, Codeship, Gitlab CI, Shippable, Travis CI + "CI", + # Explicit environment variable. + "PIP_IS_CI", ) + @staticmethod + def has_known_ci_sentinel(env: Set[str]) -> bool: + """ + Return whether it looks like pip is running under CI. + """ + # We don't use the method of checking for a tty (e.g. using isatty()) + # because some CI systems mimic a tty (e.g. Travis CI). Thus that + # method doesn't provide definitive information in either direction. + return any(name in env for name in Telemetry.KNOWN_CI_SENTINEL_VARIABLES) + class LocalFSAdapter(BaseAdapter): def send( @@ -347,7 +480,7 @@ def __init__( self.pip_proxy = None # Attach our User Agent to the request - self.headers["User-Agent"] = user_agent() + self.headers["User-Agent"] = Telemetry.user_agent_id() # Attach our Authentication handler to the session self.auth = MultiDomainBasicAuth(index_urls=index_urls) diff --git a/tests/unit/test_network_session.py b/tests/unit/test_network_session.py index b48be71fc51..7c6e6806525 100644 --- a/tests/unit/test_network_session.py +++ b/tests/unit/test_network_session.py @@ -14,16 +14,15 @@ from pip import __version__ from pip._internal.models.link import Link from pip._internal.network.session import ( - CI_ENVIRONMENT_VARIABLES, PipSession, - user_agent, + Telemetry, ) def get_user_agent() -> str: # These tests are testing the computation of the user agent, so we want to # avoid reusing cached values. 
- user_agent.cache_clear() + Telemetry.user_agent_id.cache_clear() return PipSession().headers["User-Agent"] @@ -49,10 +48,10 @@ def test_user_agent__ci( ) -> None: # Delete the variable names we use to check for CI to prevent the # detection from always returning True in case the tests are being run - # under actual CI. It is okay to depend on CI_ENVIRONMENT_VARIABLES + # under actual CI. It is okay to depend on KNOWN_CI_SENTINEL_VARIABLES # here (part of the code under test) because this setup step can only # prevent false test failures. It can't cause a false test passage. - for ci_name in CI_ENVIRONMENT_VARIABLES: + for ci_name in Telemetry.KNOWN_CI_SENTINEL_VARIABLES: monkeypatch.delenv(ci_name, raising=False) # Confirm the baseline before setting the environment variable. @@ -71,6 +70,11 @@ def test_user_agent_user_data(monkeypatch: pytest.MonkeyPatch) -> None: assert "some_string" in get_user_agent() +def test_clobber_user_agent(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv(Telemetry.CLOBBER_USER_AGENT_ENV_VAR, "some_string") + assert "some_string" == get_user_agent() + + class TestPipSession: def test_cache_defaults_off(self) -> None: session = PipSession()