diff --git a/docs/html/user_guide.rst b/docs/html/user_guide.rst
index d6a0acf9cd8..3764c14adcd 100644
--- a/docs/html/user_guide.rst
+++ b/docs/html/user_guide.rst
@@ -46,7 +46,7 @@ directly from distribution files.
The most common scenario is to install from `PyPI`_ using :ref:`Requirement
-Specifiers`
+Specifiers`.
.. tab:: Unix/macOS
@@ -66,6 +66,16 @@ Specifiers`
For more information and examples, see the :ref:`pip install` reference.
+.. note::
+
+ pip generates a ``User-Agent`` string that includes the running versions of Python and pip, along
+ with further identifying information such as the operating system, CPU architecture, libc and
+ Linux distribution. This string is sent with every HTTP(S) request pip makes to a remote host.
+
+ Setting ``PIP_TELEMETRY_USER_AGENT_ID`` in the
+ process environment to any value (including the empty string) makes pip send exactly that
+ value as the ``User-Agent`` header instead. The header value in effect is printed as part of
+ the output of :doc:`cli/pip_debug`.
+
.. _PyPI: https://pypi.org/
.. _`0-basic-authentication-credentials`:
diff --git a/news/13560.feature.rst b/news/13560.feature.rst
new file mode 100644
index 00000000000..f8814be1a24
--- /dev/null
+++ b/news/13560.feature.rst
@@ -0,0 +1 @@
+Add ``PIP_TELEMETRY_USER_AGENT_ID``, an environment variable which, when set, completely replaces the value pip sends as its ``User-Agent`` header.
diff --git a/src/pip/_internal/commands/debug.py b/src/pip/_internal/commands/debug.py
index 0e187e79c28..ceaa85a0889 100644
--- a/src/pip/_internal/commands/debug.py
+++ b/src/pip/_internal/commands/debug.py
@@ -18,6 +18,7 @@
from pip._internal.cli.status_codes import SUCCESS
from pip._internal.configuration import Configuration
from pip._internal.metadata import get_environment
+from pip._internal.network.session import Telemetry
from pip._internal.utils.compat import open_text_resource
from pip._internal.utils.logging import indent_log
from pip._internal.utils.misc import get_pip_version
@@ -190,6 +191,7 @@ def run(self, options: Values, args: list[str]) -> int:
show_value("sys.platform", sys.platform)
show_sys_implementation()
+ show_value("User-Agent header", Telemetry.user_agent_id())
show_value("'cert' config value", ca_bundle_info(self.parser.config))
show_value("REQUESTS_CA_BUNDLE", os.environ.get("REQUESTS_CA_BUNDLE"))
show_value("CURL_CA_BUNDLE", os.environ.get("CURL_CA_BUNDLE"))
diff --git a/src/pip/_internal/network/session.py b/src/pip/_internal/network/session.py
index a1f9444e37b..1bd45627fe5 100644
--- a/src/pip/_internal/network/session.py
+++ b/src/pip/_internal/network/session.py
@@ -8,20 +8,18 @@
import functools
import io
import ipaddress
-import json
import logging
import mimetypes
import os
-import platform
-import shutil
-import subprocess
-import sys
+import re
import urllib.parse
import warnings
from collections.abc import Generator, Mapping, Sequence
from typing import (
TYPE_CHECKING,
Any,
+ Callable,
+ ClassVar,
Optional,
Union,
)
@@ -35,20 +33,16 @@
from pip._vendor.urllib3.connectionpool import ConnectionPool
from pip._vendor.urllib3.exceptions import InsecureRequestWarning
-from pip import __version__
-from pip._internal.metadata import get_default_environment
from pip._internal.models.link import Link
from pip._internal.network.auth import MultiDomainBasicAuth
from pip._internal.network.cache import SafeFileCache
-
-# Import ssl from compat so the initial import occurs in only one place.
-from pip._internal.utils.compat import has_tls
-from pip._internal.utils.glibc import libc_ver
from pip._internal.utils.misc import build_url_from_netloc, parse_netloc
from pip._internal.utils.urls import url_to_path
if TYPE_CHECKING:
+ from collections.abc import Set
from ssl import SSLContext
+ from types import ModuleType
from pip._vendor.urllib3.poolmanager import PoolManager
from pip._vendor.urllib3.proxymanager import ProxyManager
@@ -76,136 +70,275 @@
]
-# These are environment variables present when running under various
-# CI systems. For each variable, some CI systems that use the variable
-# are indicated. The collection was chosen so that for each of a number
-# of popular systems, at least one of the environment variables is used.
-# This list is used to provide some indication of and lower bound for
-# CI traffic to PyPI. Thus, it is okay if the list is not comprehensive.
-# For more background, see: https://github.com/pypa/pip/issues/5499
-CI_ENVIRONMENT_VARIABLES = (
- # Azure Pipelines
- "BUILD_BUILDID",
- # Jenkins
- "BUILD_ID",
- # AppVeyor, CircleCI, Codeship, Gitlab CI, Shippable, Travis CI
- "CI",
- # Explicit environment variable.
- "PIP_IS_CI",
-)
+class Telemetry:
+ """Namespace for computing the ``User-Agent`` string pip sends with its HTTP requests.
+
+ FIXME: This string is currently propagated as a header into every single HTTP
+ request pip makes. It should really be subject to a formal PEP process
+ in order to ensure user consent around telemetry is respected.
+
+ The current implementation looks very much like the spec for *environment
+ markers*: (https://packaging.python.org/en/latest/specifications/dependency-specifiers/#environment-markers).
+
+ Environment markers were developed to provide a rich semantic structure for the
+ build environment of a dependency. Dependency resolvers like pip can then
+ manipulate these logical operators to calculate compatibility relationships,
+ *without* executing arbitrary code. This idea extends to version strings
+ themselves as well: the '===' or "arbitrary" operator was
+ specifically intended as an "escape hatch", for version strings that cannot be
+ made to conform to standard Python version strings
+ (https://packaging.python.org/en/latest/specifications/version-specifiers/#arbitrary-equality).
+
+ Environment markers can therefore be viewed as a language for codebases to
+ communicate their build process and requirements
+ (https://packaging.python.org/en/latest/flow/#the-packaging-flow) to a generic
+ resolver. Spack specs
+ (https://spack.readthedocs.io/en/latest/spec_syntax.html#sec-specs)
+ are very similar in spirit, and arose precisely to codify the recursive
+ and conditional relationships across multi-language software stacks.
+
+ Extending this concept, we could consider the *user agent id* as a language for
+ resolvers to communicate their requirements to external services and
+ execution environments. For example, clients for the GNU Make Jobserver protocol use
+ an environment variable to indicate how the jobserver should communicate to them,
+ and to indicate the maximum bandwidth they can tolerate for parallel execution:
+ (https://www.gnu.org/software/make/manual/html_node/Job-Slots.html).
+
+ The pants build tool specifically highlights the risk of exposing proprietary
+ information through thoughtless telemetry (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry).
+ As a result, not only do they explicitly specify the information being recorded
+ (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry#what-data-is-sent),
+ but they additionally incorporate anonymity as an explicit design goal
+ (https://www.pantsbuild.org/stable/docs/using-pants/anonymous-telemetry#how-we-ensure-anonymity).
+
+ Negotiating standards around useful telemetry data for PyPI began here
+ (https://github.com/pypa/pip/issues/5499), but never became a full PEP. This commit
+ (https://github.com/pypa/pip/commit/f787788a65cf7a8af8b7ef9dc13c0681d94fff5f) added
+ the output of an arbitrary subprocess execution into the string pip attaches to
+ every single HTTP request.
+
+ `PIP_USER_AGENT_USER_DATA` is mentioned in the docs once to *add* identifying info
+ to the output, in the context of a proxy server
+ (https://pip.pypa.io/en/stable/user_guide/#using-a-proxy-server).
+
+ We now introduce `PIP_TELEMETRY_USER_AGENT_ID`, which completely overrides the
+ string transmitted to the remote hosts that pip communicates with. To maintain
+ backwards compatibility, it is unset by default; when it is set, its value (which
+ may be the empty string) is used verbatim.
+ """
+ @staticmethod
+ @functools.cache
+ def user_agent_id() -> str:
+ """Return the ``User-Agent`` string computed from ``os.environ``.
+
+ The result is memoized by ``functools.cache``, so the environment is only
+ consulted on the first call. Call ``user_agent_id.cache_clear()`` to force
+ the value to be recomputed (the unit tests do this).
+ """
+ return Telemetry.calculate_user_agent_id(os.environ)
+
+ CLOBBER_USER_AGENT_ENV_VAR: ClassVar[str] = "PIP_TELEMETRY_USER_AGENT_ID"
+
+ @staticmethod
+ def calculate_user_agent_id(env: Mapping[str, str]) -> str:
+ """Compute the ``User-Agent`` string for the given environment mapping.
+
+ If ``CLOBBER_USER_AGENT_ENV_VAR`` is a key of ``env``, its value is returned
+ verbatim, even when it is the empty string. Otherwise the string is built by
+ ``calculate_legacy_user_agent_id`` from the interpreter, platform and
+ installed-tool probes passed to it below.
+ """
+ # If the clobber variable is set, that's the only thing anyone sees.
+ if Telemetry.CLOBBER_USER_AGENT_ENV_VAR in env:
+ return env[Telemetry.CLOBBER_USER_AGENT_ENV_VAR]
+
+ # Otherwise, muster all the imports and identifying info necessary to construct
+ # the legacy unspecified user-agent format.
+ import json
+ import platform
+ import sys
+
+ # The probes are passed as callables (not called here) so that the legacy
+ # builder decides which of them actually run.
+ return Telemetry.calculate_legacy_user_agent_id(
+ sys,
+ platform,
+ json,
+ Telemetry.pip_version,
+ Telemetry.linux_distribution,
+ Telemetry.libc_ver,
+ Telemetry.openssl_version,
+ Telemetry.setuptools_version,
+ Telemetry.rustc_process_execution,
+ env,
+ )
-def looks_like_ci() -> bool:
- """
- Return whether it looks like pip is running under CI.
- """
- # We don't use the method of checking for a tty (e.g. using isatty())
- # because some CI systems mimic a tty (e.g. Travis CI). Thus that
- # method doesn't provide definitive information in either direction.
- return any(name in os.environ for name in CI_ENVIRONMENT_VARIABLES)
+ _rustc_output_regex: ClassVar[re.Pattern[str]] = re.compile(r"^rustc ([^\s]+)")
+ @staticmethod
+ def rustc_process_execution() -> str | None:
+ """Return the version reported by ``rustc --version``, or None.
+
+ None is returned when ``shutil.which`` finds no ``rustc`` executable, when
+ running it fails for any reason, or when its output does not start with
+ ``rustc <version>``. This is best-effort information for the user agent,
+ so a broken or unusual Rust toolchain must never make pip itself fail.
+ """
+ import shutil
+ import subprocess
-@functools.lru_cache(maxsize=1)
-def user_agent() -> str:
- """
- Return a string representing the user agent.
- """
- data: dict[str, Any] = {
- "installer": {"name": "pip", "version": __version__},
- "python": platform.python_version(),
- "implementation": {
- "name": platform.python_implementation(),
- },
- }
-
- if data["implementation"]["name"] == "CPython":
- data["implementation"]["version"] = platform.python_version()
- elif data["implementation"]["name"] == "PyPy":
- pypy_version_info = sys.pypy_version_info # type: ignore
- if pypy_version_info.releaselevel == "final":
- pypy_version_info = pypy_version_info[:3]
- data["implementation"]["version"] = ".".join(
- [str(x) for x in pypy_version_info]
- )
- elif data["implementation"]["name"] == "Jython":
- # Complete Guess
- data["implementation"]["version"] = platform.python_version()
- elif data["implementation"]["name"] == "IronPython":
- # Complete Guess
- data["implementation"]["version"] = platform.python_version()
-
- if sys.platform.startswith("linux"):
+ if rustc := shutil.which("rustc"):
+ try:
+ rustc_output = subprocess.check_output(
+ [rustc, "--version"],
+ stderr=subprocess.STDOUT,
+ timeout=0.1,
+ encoding="utf-8",
+ )
+ # If for any reason `rustc --version` fails, silently ignore it:
+ # OSError covers a binary that vanished or cannot be executed,
+ # ValueError covers UnicodeDecodeError from undecodable output, and
+ # SubprocessError covers both CalledProcessError and TimeoutExpired.
+ except (OSError, ValueError, subprocess.SubprocessError):
+ return None
+ else:
+ if m := Telemetry._rustc_output_regex.match(rustc_output):
+ return m.group(1)
+ return None
+ return None
+
+ @staticmethod
+ def libc_ver() -> tuple[str, str]:
+ from pip._internal.utils.glibc import libc_ver
+
+ return libc_ver()
+
+ @staticmethod
+ def openssl_version() -> str | None:
+ from pip._internal.utils.compat import has_tls
+
+ if not has_tls():
+ return None
+ import _ssl as ssl
+
+ return ssl.OPENSSL_VERSION
+
+ @staticmethod
+ def setuptools_version() -> str | None:
+ from pip._internal.metadata import get_default_environment
+
+ setuptools_dist = get_default_environment().get_distribution("setuptools")
+ if setuptools_dist is None:
+ return None
+ return str(setuptools_dist.version)
+
+ @staticmethod
+ def linux_distribution() -> tuple[str, str, str]:
from pip._vendor import distro
- linux_distribution = distro.name(), distro.version(), distro.codename()
- distro_infos: dict[str, Any] = dict(
- filter(
- lambda x: x[1],
- zip(["name", "version", "id"], linux_distribution),
+ return distro.name(), distro.version(), distro.codename()
+
+ @staticmethod
+ def pip_version() -> str:
+ from pip import __version__
+
+ return __version__
+
+ @staticmethod
+ def calculate_legacy_user_agent_id(
+ sys: ModuleType,
+ platform: ModuleType,
+ json: ModuleType,
+ pip_version: Callable[[], str],
+ linux_distribution: Callable[[], tuple[str, str, str]],
+ libc_ver: Callable[[], tuple[str, str]],
+ openssl_version: Callable[[], str | None],
+ setuptools_version: Callable[[], str | None],
+ rustc_version: Callable[[], str | None],
+ env: Mapping[str, str],
+ ) -> str:
+ data: dict[str, Any] = {
+ "installer": {"name": "pip", "version": pip_version()},
+ "python": platform.python_version(),
+ "implementation": {
+ "name": platform.python_implementation(),
+ },
+ }
+
+ if data["implementation"]["name"] == "CPython":
+ data["implementation"]["version"] = platform.python_version()
+ elif data["implementation"]["name"] == "PyPy":
+ pypy_version_info = sys.pypy_version_info
+ if pypy_version_info.releaselevel == "final":
+ pypy_version_info = pypy_version_info[:3]
+ data["implementation"]["version"] = ".".join(
+ [str(x) for x in pypy_version_info]
)
- )
- libc = dict(
- filter(
- lambda x: x[1],
- zip(["lib", "version"], libc_ver()),
+ elif data["implementation"]["name"] == "Jython":
+ # Complete Guess
+ data["implementation"]["version"] = platform.python_version()
+ elif data["implementation"]["name"] == "IronPython":
+ # Complete Guess
+ data["implementation"]["version"] = platform.python_version()
+
+ if sys.platform.startswith("linux"):
+ distro_infos: dict[str, Any] = dict(
+ filter(
+ lambda x: x[1],
+ zip(["name", "version", "id"], linux_distribution()),
+ )
)
- )
- if libc:
- distro_infos["libc"] = libc
- if distro_infos:
- data["distro"] = distro_infos
+ libc = dict(
+ filter(
+ lambda x: x[1],
+ zip(["lib", "version"], libc_ver()),
+ )
+ )
+ if libc:
+ distro_infos["libc"] = libc
+ if distro_infos:
+ data["distro"] = distro_infos
- if sys.platform.startswith("darwin") and platform.mac_ver()[0]:
- data["distro"] = {"name": "macOS", "version": platform.mac_ver()[0]}
+ if sys.platform.startswith("darwin") and platform.mac_ver()[0]:
+ data["distro"] = {"name": "macOS", "version": platform.mac_ver()[0]}
- if platform.system():
- data.setdefault("system", {})["name"] = platform.system()
+ if platform.system():
+ data.setdefault("system", {})["name"] = platform.system()
- if platform.release():
- data.setdefault("system", {})["release"] = platform.release()
+ if platform.release():
+ data.setdefault("system", {})["release"] = platform.release()
- if platform.machine():
- data["cpu"] = platform.machine()
+ if platform.machine():
+ data["cpu"] = platform.machine()
- if has_tls():
- import _ssl as ssl
+ if (ssl_ver := openssl_version()) is not None:
+ data["openssl_version"] = ssl_ver
- data["openssl_version"] = ssl.OPENSSL_VERSION
+ if (setuptools_ver := setuptools_version()) is not None:
+ data["setuptools_version"] = setuptools_ver
- setuptools_dist = get_default_environment().get_distribution("setuptools")
- if setuptools_dist is not None:
- data["setuptools_version"] = str(setuptools_dist.version)
+ if (rustc_ver := rustc_version()) is not None:
+ data["rustc_version"] = rustc_ver
- if shutil.which("rustc") is not None:
- # If for any reason `rustc --version` fails, silently ignore it
- try:
- rustc_output = subprocess.check_output(
- ["rustc", "--version"], stderr=subprocess.STDOUT, timeout=0.5
- )
- except Exception:
- pass
- else:
- if rustc_output.startswith(b"rustc "):
- # The format of `rustc --version` is:
- # `b'rustc 1.52.1 (9bc8c42bb 2021-05-09)\n'`
- # We extract just the middle (1.52.1) part
- data["rustc_version"] = rustc_output.split(b" ")[1].decode()
-
- # Use None rather than False so as not to give the impression that
- # pip knows it is not being run under CI. Rather, it is a null or
- # inconclusive result. Also, we include some value rather than no
- # value to make it easier to know that the check has been run.
- data["ci"] = True if looks_like_ci() else None
-
- user_data = os.environ.get("PIP_USER_AGENT_USER_DATA")
- if user_data is not None:
- data["user_data"] = user_data
-
- return "{data[installer][name]}/{data[installer][version]} {json}".format(
- data=data,
- json=json.dumps(data, separators=(",", ":"), sort_keys=True),
+ # Use None rather than False so as not to give the impression that
+ # pip knows it is not being run under CI. Rather, it is a null or
+ # inconclusive result. Also, we include some value rather than no
+ # value to make it easier to know that the check has been run.
+ data["ci"] = (
+ True if Telemetry.has_known_ci_sentinel(frozenset(env.keys())) else None
+ )
+
+ if (user_data := env.get("PIP_USER_AGENT_USER_DATA")) is not None:
+ data["user_data"] = user_data
+
+ return "{data[installer][name]}/{data[installer][version]} {json}".format(
+ data=data,
+ json=json.dumps(data, separators=(",", ":"), sort_keys=True),
+ )
+
+ # These are environment variables present when running under various
+ # CI systems. For each variable, some CI systems that use the variable
+ # are indicated. The collection was chosen so that for each of a number
+ # of popular systems, at least one of the environment variables is used.
+ # This list is used to provide some indication of and lower bound for
+ # CI traffic to PyPI. Thus, it is okay if the list is not comprehensive.
+ # For more background, see: https://github.com/pypa/pip/issues/5499
+ KNOWN_CI_SENTINEL_VARIABLES: ClassVar[tuple[str, ...]] = (
+ # Azure Pipelines
+ "BUILD_BUILDID",
+ # Jenkins
+ "BUILD_ID",
+ # AppVeyor, CircleCI, Codeship, Gitlab CI, Shippable, Travis CI
+ "CI",
+ # Explicit environment variable.
+ "PIP_IS_CI",
)
+ @staticmethod
+ def has_known_ci_sentinel(env: Set[str]) -> bool:
+ """
+ Return whether it looks like pip is running under CI.
+ """
+ # We don't use the method of checking for a tty (e.g. using isatty())
+ # because some CI systems mimic a tty (e.g. Travis CI). Thus that
+ # method doesn't provide definitive information in either direction.
+ return any(name in env for name in Telemetry.KNOWN_CI_SENTINEL_VARIABLES)
+
class LocalFSAdapter(BaseAdapter):
def send(
@@ -347,7 +480,7 @@ def __init__(
self.pip_proxy = None
# Attach our User Agent to the request
- self.headers["User-Agent"] = user_agent()
+ self.headers["User-Agent"] = Telemetry.user_agent_id()
# Attach our Authentication handler to the session
self.auth = MultiDomainBasicAuth(index_urls=index_urls)
diff --git a/tests/unit/test_network_session.py b/tests/unit/test_network_session.py
index b48be71fc51..7c6e6806525 100644
--- a/tests/unit/test_network_session.py
+++ b/tests/unit/test_network_session.py
@@ -14,16 +14,15 @@
from pip import __version__
from pip._internal.models.link import Link
from pip._internal.network.session import (
- CI_ENVIRONMENT_VARIABLES,
PipSession,
- user_agent,
+ Telemetry,
)
def get_user_agent() -> str:
# These tests are testing the computation of the user agent, so we want to
# avoid reusing cached values.
- user_agent.cache_clear()
+ Telemetry.user_agent_id.cache_clear()
return PipSession().headers["User-Agent"]
@@ -49,10 +48,10 @@ def test_user_agent__ci(
) -> None:
# Delete the variable names we use to check for CI to prevent the
# detection from always returning True in case the tests are being run
- # under actual CI. It is okay to depend on CI_ENVIRONMENT_VARIABLES
+ # under actual CI. It is okay to depend on KNOWN_CI_SENTINEL_VARIABLES
# here (part of the code under test) because this setup step can only
# prevent false test failures. It can't cause a false test passage.
- for ci_name in CI_ENVIRONMENT_VARIABLES:
+ for ci_name in Telemetry.KNOWN_CI_SENTINEL_VARIABLES:
monkeypatch.delenv(ci_name, raising=False)
# Confirm the baseline before setting the environment variable.
@@ -71,6 +70,11 @@ def test_user_agent_user_data(monkeypatch: pytest.MonkeyPatch) -> None:
assert "some_string" in get_user_agent()
+def test_clobber_user_agent(monkeypatch: pytest.MonkeyPatch) -> None:
+ monkeypatch.setenv(Telemetry.CLOBBER_USER_AGENT_ENV_VAR, "some_string")
+ assert "some_string" == get_user_agent()
+
+
class TestPipSession:
def test_cache_defaults_off(self) -> None:
session = PipSession()