
Commit dc00479

Merge pull request #11158 from dstufft/pep691
PEP 691: JSON-based simple index API interaction
2 parents e89e391 + c1b50e1 commit dc00479


5 files changed (+283, -117 lines)


news/11158.feature.rst

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Support `PEP 691 <https://peps.python.org/pep-0691/>`_.

src/pip/_internal/index/collector.py

Lines changed: 116 additions & 52 deletions
@@ -6,6 +6,7 @@
 import email.message
 import functools
 import itertools
+import json
 import logging
 import os
 import re
@@ -65,32 +66,46 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
     return None


-class _NotHTML(Exception):
+class _NotAPIContent(Exception):
     def __init__(self, content_type: str, request_desc: str) -> None:
         super().__init__(content_type, request_desc)
         self.content_type = content_type
         self.request_desc = request_desc


-def _ensure_html_header(response: Response) -> None:
-    """Check the Content-Type header to ensure the response contains HTML.
+def _ensure_api_header(response: Response) -> None:
+    """
+    Check the Content-Type header to ensure the response contains a Simple
+    API Response.

-    Raises `_NotHTML` if the content type is not text/html.
+    Raises `_NotAPIContent` if the content type is not a valid content-type.
     """
-    content_type = response.headers.get("Content-Type", "")
-    if not content_type.lower().startswith("text/html"):
-        raise _NotHTML(content_type, response.request.method)
+    content_type = response.headers.get("Content-Type", "Unknown")
+
+    content_type_l = content_type.lower()
+    if content_type_l.startswith(
+        (
+            "text/html",
+            "application/vnd.pypi.simple.v1+html",
+            "application/vnd.pypi.simple.v1+json",
+        )
+    ):
+        return
+
+    raise _NotAPIContent(content_type, response.request.method)


 class _NotHTTP(Exception):
     pass


-def _ensure_html_response(url: str, session: PipSession) -> None:
-    """Send a HEAD request to the URL, and ensure the response contains HTML.
+def _ensure_api_response(url: str, session: PipSession) -> None:
+    """
+    Send a HEAD request to the URL, and ensure the response contains a simple
+    API Response.

     Raises `_NotHTTP` if the URL is not available for a HEAD request, or
-    `_NotHTML` if the content type is not text/html.
+    `_NotAPIContent` if the content type is not a valid content type.
     """
     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
     if scheme not in {"http", "https"}:
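
Beyond the rename, the header check now accepts three content types instead of only text/html. A minimal standalone sketch of the same prefix check (the names below are illustrative, not pip's API):

# Sketch of the Content-Type check above; the tuple mirrors the one
# passed to str.startswith() in _ensure_api_header.
ACCEPTED_PREFIXES = (
    "text/html",
    "application/vnd.pypi.simple.v1+html",
    "application/vnd.pypi.simple.v1+json",
)

def is_simple_api_content_type(content_type: str) -> bool:
    # Prefix matching tolerates parameters such as "; charset=utf-8".
    return content_type.lower().startswith(ACCEPTED_PREFIXES)

assert is_simple_api_content_type("application/vnd.pypi.simple.v1+json")
assert is_simple_api_content_type("TEXT/HTML; charset=UTF-8")
assert not is_simple_api_content_type("application/octet-stream")
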
@@ -99,31 +114,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
     resp = session.head(url, allow_redirects=True)
     raise_for_status(resp)

-    _ensure_html_header(resp)
+    _ensure_api_header(resp)


-def _get_html_response(url: str, session: PipSession) -> Response:
-    """Access an HTML page with GET, and return the response.
+def _get_simple_response(url: str, session: PipSession) -> Response:
+    """Access a Simple API response with GET, and return the response.

     This consists of three parts:

     1. If the URL looks suspiciously like an archive, send a HEAD first to
-       check the Content-Type is HTML, to avoid downloading a large file.
-       Raise `_NotHTTP` if the content type cannot be determined, or
-       `_NotHTML` if it is not HTML.
+       check the Content-Type is HTML or Simple API, to avoid downloading a
+       large file. Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotAPIContent` if it is not HTML or a Simple API.
     2. Actually perform the request. Raise HTTP exceptions on network failures.
-    3. Check the Content-Type header to make sure we got HTML, and raise
-       `_NotHTML` otherwise.
+    3. Check the Content-Type header to make sure we got a Simple API response,
+       and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_html_response(url, session=session)
+        _ensure_api_response(url, session=session)

     logger.debug("Getting page %s", redact_auth_from_url(url))

     resp = session.get(
         url,
         headers={
-            "Accept": "text/html",
+            "Accept": ", ".join(
+                [
+                    "application/vnd.pypi.simple.v1+json",
+                    "application/vnd.pypi.simple.v1+html; q=0.1",
+                    "text/html; q=0.01",
+                ]
+            ),
             # We don't want to blindly returned cached data for
             # /simple/, because authors generally expecting that
             # twine upload && pip install will function, but if
@@ -145,9 +166,16 @@ def _get_html_response(url: str, session: PipSession) -> Response:
     # The check for archives above only works if the url ends with
     # something that looks like an archive. However that is not a
     # requirement of an url. Unless we issue a HEAD request on every
-    # url we cannot know ahead of time for sure if something is HTML
-    # or not. However we can check after we've downloaded it.
-    _ensure_html_header(resp)
+    # url we cannot know ahead of time for sure if something is a
+    # Simple API response or not. However we can check after we've
+    # downloaded it.
+    _ensure_api_header(resp)
+
+    logger.debug(
+        "Fetched page %s as %s",
+        redact_auth_from_url(url),
+        resp.headers.get("Content-Type", "Unknown"),
+    )

     return resp

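The new Accept header is standard HTTP content negotiation: the q-values tell the server that pip prefers the PEP 691 JSON serialization, will take the v1 HTML serialization, and only falls back to legacy text/html as a last resort. A hedged sketch of the same negotiation using plain requests (the index URL is just an example):

import requests

# Mirrors the Accept header built in _get_simple_response above.
accept = ", ".join(
    [
        "application/vnd.pypi.simple.v1+json",         # preferred
        "application/vnd.pypi.simple.v1+html; q=0.1",
        "text/html; q=0.01",                           # legacy fallback
    ]
)
resp = requests.get("https://pypi.org/simple/pip/", headers={"Accept": accept})
# The serialization the server chose comes back in Content-Type,
# which is what the new "Fetched page %s as %s" debug line reports.
print(resp.headers.get("Content-Type", "Unknown"))
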
@@ -273,7 +301,7 @@ def _create_link_from_element(


 class CacheablePageContent:
-    def __init__(self, page: "HTMLPage") -> None:
+    def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
         self.page = page

@@ -286,15 +314,15 @@ def __hash__(self) -> int:

 class ParseLinks(Protocol):
     def __call__(
-        self, page: "HTMLPage", use_deprecated_html5lib: bool
+        self, page: "IndexContent", use_deprecated_html5lib: bool
     ) -> Iterable[Link]:
         ...


-def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
+def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
     """
-    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
-    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    Given a function that parses an Iterable[Link] from an IndexContent, cache the
+    function's result (keyed by CacheablePageContent), unless the IndexContent
     `page` has `page.cache_link_parsing == False`.
     """

@@ -305,15 +333,17 @@ def wrapper(
         return list(fn(cacheable_page.page, use_deprecated_html5lib))

     @functools.wraps(fn)
-    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
+    def wrapper_wrapper(
+        page: "IndexContent", use_deprecated_html5lib: bool
+    ) -> List[Link]:
         if page.cache_link_parsing:
             return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
         return list(fn(page, use_deprecated_html5lib))

     return wrapper_wrapper


-def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
+def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
     """
     Parse an HTML document, and yield its anchor elements as Link objects.

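
The caching strategy is unchanged by the rename: the page is wrapped in a key object (CacheablePageContent) whose equality and hash let functools.lru_cache memoize the parse per page. A minimal sketch of that pattern, under illustrative names rather than pip's classes:

import functools

class _CacheKey:
    # Hash by URL so repeated parses of the same page hit the cache.
    def __init__(self, url: str) -> None:
        self.url = url

    def __eq__(self, other: object) -> bool:
        return isinstance(other, _CacheKey) and self.url == other.url

    def __hash__(self) -> int:
        return hash(self.url)

@functools.lru_cache(maxsize=None)
def parse_once(key: _CacheKey) -> list:
    print("parsing", key.url)  # executes once per distinct URL
    return []

parse_once(_CacheKey("https://pypi.org/simple/pip/"))
parse_once(_CacheKey("https://pypi.org/simple/pip/"))  # cache hit, no print
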
@@ -338,12 +368,36 @@ def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
         yield link


-@with_cached_html_pages
-def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
+@with_cached_index_content
+def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
     """
-    Parse an HTML document, and yield its anchor elements as Link objects.
+    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
     """

+    content_type_l = page.content_type.lower()
+    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        data = json.loads(page.content)
+        for file in data.get("files", []):
+            file_url = file.get("url")
+            if file_url is None:
+                continue
+
+            # The Link.yanked_reason expects an empty string instead of a boolean.
+            yanked_reason = file.get("yanked")
+            if yanked_reason and not isinstance(yanked_reason, str):
+                yanked_reason = ""
+            # The Link.yanked_reason expects None instead of False
+            elif not yanked_reason:
+                yanked_reason = None
+
+            yield Link(
+                _clean_link(urllib.parse.urljoin(page.url, file_url)),
+                comes_from=page.url,
+                requires_python=file.get("requires-python"),
+                yanked_reason=yanked_reason,
+                hashes=file.get("hashes", {}),
+            )
+
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
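
The new branch maps each entry of a PEP 691 "files" array onto a Link, normalizing "yanked" (which the spec allows to be false, true, or a reason string) into the string-or-None shape that Link.yanked_reason expects. A standalone sketch with sample data (the payload is illustrative, not a real index response):

import json

payload = json.dumps({
    "meta": {"api-version": "1.0"},
    "name": "example",
    "files": [
        {
            "filename": "example-1.0.tar.gz",
            "url": "example-1.0.tar.gz",
            "hashes": {"sha256": "deadbeef"},
            "requires-python": ">=3.7",
            "yanked": "broken on Windows",  # may also be true or false
        }
    ],
})

for file in json.loads(payload).get("files", []):
    yanked = file.get("yanked")
    # Same normalization as the diff: boolean True -> "", falsy -> None.
    if yanked and not isinstance(yanked, str):
        yanked = ""
    elif not yanked:
        yanked = None
    print(file["url"], file.get("hashes", {}), yanked)
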
@@ -365,12 +419,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
         yield link


-class HTMLPage:
-    """Represents one page, along with its URL"""
+class IndexContent:
+    """Represents one response (or page), along with its URL"""

     def __init__(
         self,
         content: bytes,
+        content_type: str,
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
@@ -383,6 +438,7 @@ def __init__(
         have this set to False, for example.
         """
         self.content = content
+        self.content_type = content_type
         self.encoding = encoding
         self.url = url
         self.cache_link_parsing = cache_link_parsing
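
The added content_type field is what parse_links dispatches on to pick the JSON or HTML code path. A minimal dataclass sketch of the same shape (illustrative only; pip's IndexContent is a plain class with an explicit __init__):

from dataclasses import dataclass
from typing import Optional

@dataclass
class IndexContentSketch:
    # Mirrors the fields IndexContent now carries.
    content: bytes
    content_type: str  # e.g. "application/vnd.pypi.simple.v1+json"
    encoding: Optional[str]
    url: str
    cache_link_parsing: bool = True

page = IndexContentSketch(
    content=b'{"files": []}',
    content_type="application/vnd.pypi.simple.v1+json",
    encoding=None,
    url="https://pypi.org/simple/example/",
)
print(page.content_type)
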
@@ -419,7 +475,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
         return None


-def _handle_get_page_fail(
+def _handle_get_simple_fail(
     link: Link,
     reason: Union[str, Exception],
     meth: Optional[Callable[..., None]] = None,
@@ -429,19 +485,22 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)


-def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
+def _make_index_content(
+    response: Response, cache_link_parsing: bool = True
+) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(
+    return IndexContent(
         response.content,
+        response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
     )


-def _get_html_page(
+def _get_index_content(
     link: Link, session: Optional[PipSession] = None
-) -> Optional["HTMLPage"]:
+) -> Optional["IndexContent"]:
     if session is None:
         raise TypeError(
             "_get_html_page() missing 1 required keyword argument: 'session'"
@@ -466,39 +525,44 @@ def _get_html_page(
         # final segment
         if not url.endswith("/"):
             url += "/"
+        # TODO: In the future, it would be nice if pip supported PEP 691
+        #       style responses in the file:// URLs, however there's no
+        #       standard file extension for application/vnd.pypi.simple.v1+json
+        #       so we'll need to come up with something on our own.
         url = urllib.parse.urljoin(url, "index.html")
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_html_response(url, session=session)
+        resp = _get_simple_response(url, session=session)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
             "be checked by a HTTP HEAD request.",
             link,
         )
-    except _NotHTML as exc:
+    except _NotAPIContent as exc:
         logger.warning(
-            "Skipping page %s because the %s request got Content-Type: %s."
-            "The only supported Content-Type is text/html",
+            "Skipping page %s because the %s request got Content-Type: %s. "
+            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
+            "application/vnd.pypi.simple.v1+html, and text/html",
             link,
             exc.request_desc,
             exc.content_type,
         )
     except NetworkConnectionError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except RetryError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
         reason += str(exc)
-        _handle_get_page_fail(link, reason, meth=logger.info)
+        _handle_get_simple_fail(link, reason, meth=logger.info)
     except requests.ConnectionError as exc:
-        _handle_get_page_fail(link, f"connection error: {exc}")
+        _handle_get_simple_fail(link, f"connection error: {exc}")
     except requests.Timeout:
-        _handle_get_page_fail(link, "timed out")
+        _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
     return None

@@ -561,11 +625,11 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_page(self, location: Link) -> Optional[HTMLPage]:
+    def fetch_response(self, location: Link) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_html_page(location, session=self.session)
+        return _get_index_content(location, session=self.session)

     def collect_sources(
         self,

src/pip/_internal/index/package_finder.py

Lines changed: 3 additions & 3 deletions
@@ -792,11 +792,11 @@ def process_project_url(
             "Fetching project page and analyzing links: %s",
             project_url,
         )
-        html_page = self._link_collector.fetch_page(project_url)
-        if html_page is None:
+        index_response = self._link_collector.fetch_response(project_url)
+        if index_response is None:
             return []

-        page_links = list(parse_links(html_page, self._use_deprecated_html5lib))
+        page_links = list(parse_links(index_response, self._use_deprecated_html5lib))

         with indent_log():
             package_links = self.evaluate_links(
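
The caller's shape is unchanged by the rename: fetch, bail out on None, then parse. A stubbed sketch of that flow (the fetch and parse functions here are placeholders, not pip internals):

import json
from typing import List, Optional

def fetch_index(url: str) -> Optional[bytes]:
    # Placeholder: pretend the index returned a PEP 691 JSON body.
    return b'{"files": []}'

def parse_file_urls(content: bytes) -> List[str]:
    return [f["url"] for f in json.loads(content).get("files", []) if "url" in f]

index_response = fetch_index("https://pypi.org/simple/example/")
page_links = [] if index_response is None else parse_file_urls(index_response)
print(page_links)  # -> []
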
