 import email.message
 import functools
 import itertools
+import json
 import logging
 import os
 import re
@@ -65,32 +66,46 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
     return None


-class _NotHTML(Exception):
+class _NotAPIContent(Exception):
     def __init__(self, content_type: str, request_desc: str) -> None:
         super().__init__(content_type, request_desc)
         self.content_type = content_type
         self.request_desc = request_desc


-def _ensure_html_header(response: Response) -> None:
-    """Check the Content-Type header to ensure the response contains HTML.
+def _ensure_api_header(response: Response) -> None:
+    """
+    Check the Content-Type header to ensure the response contains a Simple
+    API response.

-    Raises `_NotHTML` if the content type is not text/html.
+    Raises `_NotAPIContent` if the content type is not a valid content type.
     """
-    content_type = response.headers.get("Content-Type", "")
-    if not content_type.lower().startswith("text/html"):
-        raise _NotHTML(content_type, response.request.method)
+    content_type = response.headers.get("Content-Type", "Unknown")
+
+    content_type_l = content_type.lower()
+    if content_type_l.startswith(
+        (
+            "text/html",
+            "application/vnd.pypi.simple.v1+html",
+            "application/vnd.pypi.simple.v1+json",
+        )
+    ):
+        return
+
+    raise _NotAPIContent(content_type, response.request.method)


 class _NotHTTP(Exception):
     pass


-def _ensure_html_response(url: str, session: PipSession) -> None:
-    """Send a HEAD request to the URL, and ensure the response contains HTML.
+def _ensure_api_response(url: str, session: PipSession) -> None:
+    """
+    Send a HEAD request to the URL, and ensure the response contains a Simple
+    API response.

     Raises `_NotHTTP` if the URL is not available for a HEAD request, or
-    `_NotHTML` if the content type is not text/html.
+    `_NotAPIContent` if the content type is not a valid content type.
     """
     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
     if scheme not in {"http", "https"}:
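
A minimal, self-contained sketch of the content negotiation rule that `_ensure_api_header` enforces. The helper name `is_simple_api_content_type` is made up for illustration; pip itself raises `_NotAPIContent` rather than returning a bool:

```python
# Accepted Content-Types for a Simple API response: the two PEP 691
# serializations plus the legacy HTML index format.
_ACCEPTED = (
    "text/html",
    "application/vnd.pypi.simple.v1+html",
    "application/vnd.pypi.simple.v1+json",
)

def is_simple_api_content_type(content_type: str) -> bool:
    # str.startswith accepts a tuple and matches any prefix, so media-type
    # parameters such as "; charset=utf-8" are ignored by the check.
    return content_type.lower().startswith(_ACCEPTED)

assert is_simple_api_content_type("application/vnd.pypi.simple.v1+json")
assert is_simple_api_content_type("TEXT/HTML; charset=utf-8")
assert not is_simple_api_content_type("application/octet-stream")
```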
@@ -99,31 +114,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
     resp = session.head(url, allow_redirects=True)
     raise_for_status(resp)

-    _ensure_html_header(resp)
+    _ensure_api_header(resp)


-def _get_html_response(url: str, session: PipSession) -> Response:
-    """Access an HTML page with GET, and return the response.
+def _get_simple_response(url: str, session: PipSession) -> Response:
+    """Access a Simple API response with GET, and return the response.

     This consists of three parts:

     1. If the URL looks suspiciously like an archive, send a HEAD first to
-       check the Content-Type is HTML, to avoid downloading a large file.
-       Raise `_NotHTTP` if the content type cannot be determined, or
-       `_NotHTML` if it is not HTML.
+       check the Content-Type is HTML or Simple API, to avoid downloading a
+       large file. Raise `_NotHTTP` if the content type cannot be determined,
+       or `_NotAPIContent` if it is not HTML or a Simple API response.
     2. Actually perform the request. Raise HTTP exceptions on network failures.
-    3. Check the Content-Type header to make sure we got HTML, and raise
-       `_NotHTML` otherwise.
+    3. Check the Content-Type header to make sure we got a Simple API response,
+       and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_html_response(url, session=session)
+        _ensure_api_response(url, session=session)

     logger.debug("Getting page %s", redact_auth_from_url(url))

     resp = session.get(
         url,
         headers={
-            "Accept": "text/html",
+            "Accept": ", ".join(
+                [
+                    "application/vnd.pypi.simple.v1+json",
+                    "application/vnd.pypi.simple.v1+html; q=0.1",
+                    "text/html; q=0.01",
+                ]
+            ),
             # We don't want to blindly return cached data for
             # /simple/, because authors generally expect that
             # twine upload && pip install will function, but if
@@ -145,9 +166,16 @@ def _get_html_response(url: str, session: PipSession) -> Response:
     # The check for archives above only works if the url ends with
     # something that looks like an archive. However that is not a
     # requirement of an url. Unless we issue a HEAD request on every
-    # url we cannot know ahead of time for sure if something is HTML
-    # or not. However we can check after we've downloaded it.
-    _ensure_html_header(resp)
+    # url we cannot know ahead of time for sure if something is a
+    # Simple API response or not. However we can check after we've
+    # downloaded it.
+    _ensure_api_header(resp)
+
+    logger.debug(
+        "Fetched page %s as %s",
+        redact_auth_from_url(url),
+        resp.headers.get("Content-Type", "Unknown"),
+    )

     return resp
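
The Accept header assembled in `_get_simple_response` above is worth seeing in its final, joined form. A small sketch (the `ACCEPT` name and the example `session.get` call are illustrative, not pip's own names): the q-values tell a PEP 691 server to prefer the JSON serialization, then the newer HTML one, then legacy text/html.

```python
ACCEPT = ", ".join(
    [
        "application/vnd.pypi.simple.v1+json",
        "application/vnd.pypi.simple.v1+html; q=0.1",
        "text/html; q=0.01",
    ]
)
assert ACCEPT == (
    "application/vnd.pypi.simple.v1+json, "
    "application/vnd.pypi.simple.v1+html; q=0.1, "
    "text/html; q=0.01"
)
# e.g. with requests, assuming `url` and a configured `session`:
#   resp = session.get(url, headers={"Accept": ACCEPT})
```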
@@ -273,7 +301,7 @@ def _create_link_from_element(


 class CacheablePageContent:
-    def __init__(self, page: "HTMLPage") -> None:
+    def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
         self.page = page

@@ -286,15 +314,15 @@ def __hash__(self) -> int:

 class ParseLinks(Protocol):
     def __call__(
-        self, page: "HTMLPage", use_deprecated_html5lib: bool
+        self, page: "IndexContent", use_deprecated_html5lib: bool
     ) -> Iterable[Link]:
         ...


-def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
+def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
     """
-    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
-    function's result (keyed by CacheablePageContent), unless the HTMLPage
+    Given a function that parses an Iterable[Link] from an IndexContent, cache the
+    function's result (keyed by CacheablePageContent), unless the IndexContent
     `page` has `page.cache_link_parsing == False`.
     """

@@ -305,15 +333,17 @@ def wrapper(
         return list(fn(cacheable_page.page, use_deprecated_html5lib))

     @functools.wraps(fn)
-    def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
+    def wrapper_wrapper(
+        page: "IndexContent", use_deprecated_html5lib: bool
+    ) -> List[Link]:
         if page.cache_link_parsing:
             return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
         return list(fn(page, use_deprecated_html5lib))

     return wrapper_wrapper


-def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
+def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
     """
     Parse an HTML document, and yield its anchor elements as Link objects.
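
The decorator above relies on a small hashable wrapper (CacheablePageContent) so that a `functools.lru_cache`d helper can key on page identity. A simplified, self-contained sketch of the same pattern; the names `_CacheKey` and `with_cached_parse` are illustrative, not pip's:

```python
import functools
from typing import Any, Callable, List

class _CacheKey:
    """Hashable stand-in for an unhashable page object, keyed by its URL."""

    def __init__(self, page: Any) -> None:
        self.page = page

    def __eq__(self, other: object) -> bool:
        return isinstance(other, _CacheKey) and self.page.url == other.page.url

    def __hash__(self) -> int:
        return hash(self.page.url)

def with_cached_parse(fn: Callable[[Any], List[Any]]) -> Callable[[Any], List[Any]]:
    @functools.lru_cache(maxsize=None)
    def cached(key: _CacheKey) -> List[Any]:
        return fn(key.page)

    @functools.wraps(fn)
    def wrapper(page: Any) -> List[Any]:
        # Only cache when the page opts in, mirroring cache_link_parsing.
        if getattr(page, "cache_link_parsing", False):
            return cached(_CacheKey(page))
        return fn(page)

    return wrapper
```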
@@ -338,12 +368,36 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
         yield link


-@with_cached_html_pages
-def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
+@with_cached_index_content
+def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
     """
-    Parse an HTML document, and yield its anchor elements as Link objects.
+    Parse a Simple API's index content, and yield its anchor elements as
+    Link objects.
     """

+    content_type_l = page.content_type.lower()
+    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
+        data = json.loads(page.content)
+        for file in data.get("files", []):
+            file_url = file.get("url")
+            if file_url is None:
+                continue
+
+            # Link.yanked_reason expects an empty string instead of a boolean.
+            yanked_reason = file.get("yanked")
+            if yanked_reason and not isinstance(yanked_reason, str):
+                yanked_reason = ""
+            # Link.yanked_reason expects None instead of False.
+            elif not yanked_reason:
+                yanked_reason = None
+
+            yield Link(
+                _clean_link(urllib.parse.urljoin(page.url, file_url)),
+                comes_from=page.url,
+                requires_python=file.get("requires-python"),
+                yanked_reason=yanked_reason,
+                hashes=file.get("hashes", {}),
+            )
+        # Don't fall through to the HTML parsers for a JSON response.
+        return
+
     if use_deprecated_html5lib:
         yield from _parse_links_html5lib(page)
         return
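
To make the JSON branch concrete, here is a hedged sketch of what a PEP 691 project page looks like and how the yanked normalization above behaves. The project name, file URLs, and hash value are made up for illustration:

```python
import json

sample = json.loads("""
{
  "meta": {"api-version": "1.0"},
  "name": "frob",
  "files": [
    {"filename": "frob-1.0.tar.gz",
     "url": "frob-1.0.tar.gz",
     "hashes": {"sha256": "deadbeef"},
     "requires-python": ">=3.7",
     "yanked": "broken metadata"},
    {"filename": "frob-1.1.tar.gz",
     "url": "frob-1.1.tar.gz",
     "hashes": {},
     "yanked": false}
  ]
}
""")

for file in sample["files"]:
    yanked_reason = file.get("yanked")
    if yanked_reason and not isinstance(yanked_reason, str):
        yanked_reason = ""    # yanked is true, but no reason was given
    elif not yanked_reason:
        yanked_reason = None  # not yanked (false or missing)
    print(file["url"], repr(yanked_reason))

# frob-1.0.tar.gz 'broken metadata'
# frob-1.1.tar.gz None
```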
@@ -365,12 +419,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
         yield link


-class HTMLPage:
-    """Represents one page, along with its URL"""
+class IndexContent:
+    """Represents one response (or page), along with its URL."""

     def __init__(
         self,
         content: bytes,
+        content_type: str,
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
@@ -383,6 +438,7 @@ def __init__(
                                    have this set to False, for example.
         """
         self.content = content
+        self.content_type = content_type
         self.encoding = encoding
         self.url = url
         self.cache_link_parsing = cache_link_parsing
@@ -419,7 +475,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
         return None


-def _handle_get_page_fail(
+def _handle_get_simple_fail(
     link: Link,
     reason: Union[str, Exception],
     meth: Optional[Callable[..., None]] = None,
@@ -429,19 +485,22 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)


-def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
+def _make_index_content(
+    response: Response, cache_link_parsing: bool = True
+) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(
+    return IndexContent(
         response.content,
+        response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
     )


-def _get_html_page(
+def _get_index_content(
     link: Link, session: Optional[PipSession] = None
-) -> Optional["HTMLPage"]:
+) -> Optional["IndexContent"]:
     if session is None:
         raise TypeError(
-            "_get_html_page() missing 1 required keyword argument: 'session'"
+            "_get_index_content() missing 1 required keyword argument: 'session'"
@@ -466,39 +525,44 @@ def _get_html_page(
         # final segment
         if not url.endswith("/"):
             url += "/"
+        # TODO: In the future, it would be nice if pip supported PEP 691
+        #       style responses for file:// URLs; however, there's no
+        #       standard file extension for application/vnd.pypi.simple.v1+json
+        #       so we'll need to come up with something on our own.
         url = urllib.parse.urljoin(url, "index.html")
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_html_response(url, session=session)
+        resp = _get_simple_response(url, session=session)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
             "be checked by a HTTP HEAD request.",
             link,
         )
-    except _NotHTML as exc:
+    except _NotAPIContent as exc:
         logger.warning(
-            "Skipping page %s because the %s request got Content-Type: %s."
-            "The only supported Content-Type is text/html",
+            "Skipping page %s because the %s request got Content-Type: %s. "
+            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
+            "application/vnd.pypi.simple.v1+html, and text/html",
             link,
             exc.request_desc,
             exc.content_type,
         )
     except NetworkConnectionError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except RetryError as exc:
-        _handle_get_page_fail(link, exc)
+        _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
         reason += str(exc)
-        _handle_get_page_fail(link, reason, meth=logger.info)
+        _handle_get_simple_fail(link, reason, meth=logger.info)
     except requests.ConnectionError as exc:
-        _handle_get_page_fail(link, f"connection error: {exc}")
+        _handle_get_simple_fail(link, f"connection error: {exc}")
     except requests.Timeout:
-        _handle_get_page_fail(link, "timed out")
+        _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)

     return None
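
Putting the fetch and parse halves together, a hedged sketch of how these internals compose. These are pip-internal APIs, shown only to illustrate the flow; the `session` construction is elided:

```python
# Assuming a configured PipSession bound to `session`, and a Simple API URL:
link = Link("https://pypi.org/simple/pip/")

content = _get_index_content(link, session=session)  # IndexContent or None
if content is not None:
    # parse_links dispatches on content.content_type: the PEP 691 JSON
    # branch above, or the HTML parsers for text/html responses.
    for parsed in parse_links(content, use_deprecated_html5lib=False):
        print(parsed.url)
```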
@@ -561,11 +625,11 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_page(self, location: Link) -> Optional[HTMLPage]:
+    def fetch_response(self, location: Link) -> Optional[IndexContent]:
         """
-        Fetch an HTML page containing package links.
+        Fetch a Simple API response (HTML or JSON) containing package links.
         """
-        return _get_html_page(location, session=self.session)
+        return _get_index_content(location, session=self.session)

     def collect_sources(
         self,