Skip to content

Commit 93baede

Browse files
authored
legacy repository: prefer JSON API to HTML API (#10672)
Motivation: some information (e.g. size and upload-time) that may be required in the future for features like pylock.toml and minimumReleaseAge/exclude-newer is only available via JSON.
Changes:
* prefer JSON in legacy repositories and fall back to HTML if JSON is not supported
* add support for JSON root pages
* add support for relative URLs in JSON pages
* add support for hashes in JSON pages
* extend legacy tests so that they are run with the HTML variant and the JSON variant
* harmonize HTML and JSON fixtures so that we get the same results in the tests
1 parent e7b52af commit 93baede

File tree

24 files changed

+650
-52
lines changed

24 files changed

+650
-52
lines changed

src/poetry/repositories/http_repository.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from poetry.repositories.exceptions import PackageNotFoundError
2727
from poetry.repositories.exceptions import RepositoryError
2828
from poetry.repositories.link_sources.html import HTMLPage
29+
from poetry.repositories.link_sources.json import SimpleJsonPage
2930
from poetry.utils.authenticator import Authenticator
3031
from poetry.utils.constants import REQUESTS_TIMEOUT
3132
from poetry.utils.helpers import HTTPRangeRequestSupportedError
@@ -417,11 +418,13 @@ def calculate_sha256(self, link: Link) -> str | None:
417418
return f"{required_hash.name}:{required_hash.hexdigest()}"
418419
return None
419420

420-
def _get_response(self, endpoint: str) -> requests.Response | None:
421+
def _get_response(
422+
self, endpoint: str, *, headers: dict[str, str] | None = None
423+
) -> requests.Response | None:
421424
url = self._url + endpoint
422425
try:
423426
response: requests.Response = self.session.get(
424-
url, raise_for_status=False, timeout=REQUESTS_TIMEOUT
427+
url, raise_for_status=False, timeout=REQUESTS_TIMEOUT, headers=headers
425428
)
426429
if response.status_code in (401, 403):
427430
self._log(
@@ -442,8 +445,25 @@ def _get_response(self, endpoint: str) -> requests.Response | None:
442445
)
443446
return response
444447

448+
def _get_prefer_json_header(self) -> dict[str, str]:
449+
# Prefer json, but accept anything for backwards compatibility.
450+
# Although the more specific value should be preferred to the less specific one
451+
# according to https://developer.mozilla.org/en-US/docs/Glossary/Quality_values,
452+
# we add a quality value because some servers still prefer html without one.
453+
return {"Accept": "application/vnd.pypi.simple.v1+json, */*;q=0.1"}
454+
455+
def _is_json_response(self, response: requests.Response) -> bool:
456+
return (
457+
response.headers.get("Content-Type", "").split(";")[0].strip()
458+
== "application/vnd.pypi.simple.v1+json"
459+
)
460+
445461
def _get_page(self, name: NormalizedName) -> LinkSource:
446-
response = self._get_response(f"/{name}/")
462+
response = self._get_response(
463+
f"/{name}/", headers=self._get_prefer_json_header()
464+
)
447465
if not response:
448466
raise PackageNotFoundError(f"Package [{name}] not found.")
467+
if self._is_json_response(response):
468+
return SimpleJsonPage(response.url, response.json())
449469
return HTMLPage(response.url, response.text)

src/poetry/repositories/legacy_repository.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
from poetry.inspection.info import PackageInfo
1313
from poetry.repositories.exceptions import PackageNotFoundError
1414
from poetry.repositories.http_repository import HTTPRepository
15-
from poetry.repositories.link_sources.html import HTMLPage
16-
from poetry.repositories.link_sources.html import SimpleRepositoryRootPage
15+
from poetry.repositories.link_sources.base import SimpleRepositoryRootPage
16+
from poetry.repositories.link_sources.html import SimpleRepositoryHTMLRootPage
17+
from poetry.repositories.link_sources.json import SimpleRepositoryJsonRootPage
1718

1819

1920
if TYPE_CHECKING:
@@ -130,21 +131,21 @@ def _get_release_info(
130131
),
131132
)
132133

133-
def _get_page(self, name: NormalizedName) -> HTMLPage:
134-
if not (response := self._get_response(f"/{name}/")):
135-
raise PackageNotFoundError(f"Package [{name}] not found.")
136-
return HTMLPage(response.url, response.text)
137-
138134
@cached_property
139135
def root_page(self) -> SimpleRepositoryRootPage:
140-
if not (response := self._get_response("/")):
136+
if not (
137+
response := self._get_response("/", headers=self._get_prefer_json_header())
138+
):
141139
self._log(
142140
f"Unable to retrieve package listing from package source {self.name}",
143141
level="error",
144142
)
145143
return SimpleRepositoryRootPage()
146144

147-
return SimpleRepositoryRootPage(response.text)
145+
if self._is_json_response(response):
146+
return SimpleRepositoryJsonRootPage(response.json())
147+
148+
return SimpleRepositoryHTMLRootPage(response.text)
148149

149150
def search(self, query: str | list[str]) -> list[Package]:
150151
results: list[Package] = []

src/poetry/repositories/link_sources/base.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,24 @@ def yanked(self, name: NormalizedName, version: Version) -> str | bool:
124124
@cached_property
125125
def _link_cache(self) -> LinkCache:
126126
raise NotImplementedError()
127+
128+
129+
class SimpleRepositoryRootPage:
130+
"""
131+
This class represents the parsed content of a "simple" repository's root page.
132+
"""
133+
134+
def search(self, query: str | list[str]) -> list[str]:
135+
results: list[str] = []
136+
tokens = query if isinstance(query, list) else [query]
137+
138+
for name in self.package_names:
139+
if any(token in name for token in tokens):
140+
results.append(name)
141+
142+
return results
143+
144+
@cached_property
145+
def package_names(self) -> list[str]:
146+
# should be overridden in subclasses
147+
return []

src/poetry/repositories/link_sources/html.py

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from poetry.core.packages.utils.link import Link
1111

1212
from poetry.repositories.link_sources.base import LinkSource
13+
from poetry.repositories.link_sources.base import SimpleRepositoryRootPage
1314
from poetry.repositories.parsers.html_page_parser import HTMLPageParser
1415

1516

@@ -68,10 +69,11 @@ def _link_cache(self) -> LinkCache:
6869
return links
6970

7071

71-
class SimpleRepositoryRootPage:
72+
class SimpleRepositoryHTMLRootPage(SimpleRepositoryRootPage):
7273
"""
73-
This class represents the parsed content of a "simple" repository's root page. This follows the
74-
specification laid out in PEP 503.
74+
This class represents the parsed content of the HTML version
75+
of a "simple" repository's root page.
76+
This follows the specification laid out in PEP 503.
7577
7678
See: https://peps.python.org/pep-0503/
7779
"""
@@ -81,17 +83,6 @@ def __init__(self, content: str | None = None) -> None:
8183
parser.feed(content or "")
8284
self._parsed = parser.anchors
8385

84-
def search(self, query: str | list[str]) -> list[str]:
85-
results: list[str] = []
86-
tokens = query if isinstance(query, list) else [query]
87-
88-
for anchor in self._parsed:
89-
href = anchor.get("href")
90-
if href and any(token in href for token in tokens):
91-
results.append(href.rstrip("/"))
92-
93-
return results
94-
9586
@cached_property
9687
def package_names(self) -> list[str]:
9788
results: list[str] = []

src/poetry/repositories/link_sources/json.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
import urllib.parse
4+
35
from collections import defaultdict
46
from functools import cached_property
57
from typing import TYPE_CHECKING
@@ -8,6 +10,7 @@
810
from poetry.core.packages.utils.link import Link
911

1012
from poetry.repositories.link_sources.base import LinkSource
13+
from poetry.repositories.link_sources.base import SimpleRepositoryRootPage
1114

1215

1316
if TYPE_CHECKING:
@@ -25,8 +28,9 @@ def __init__(self, url: str, content: dict[str, Any]) -> None:
2528
def _link_cache(self) -> LinkCache:
2629
links: LinkCache = defaultdict(lambda: defaultdict(list))
2730
for file in self.content["files"]:
28-
url = file["url"]
31+
url = self.clean_link(urllib.parse.urljoin(self._url, file["url"]))
2932
requires_python = file.get("requires-python")
33+
hashes = file.get("hashes", {})
3034
yanked = file.get("yanked", False)
3135

3236
# see https://peps.python.org/pep-0714/#clients
@@ -42,7 +46,11 @@ def _link_cache(self) -> LinkCache:
4246
break
4347

4448
link = Link(
45-
url, requires_python=requires_python, yanked=yanked, metadata=metadata
49+
url,
50+
requires_python=requires_python,
51+
hashes=hashes,
52+
yanked=yanked,
53+
metadata=metadata,
4654
)
4755

4856
if link.ext not in self.SUPPORTED_FORMATS:
@@ -53,3 +61,26 @@ def _link_cache(self) -> LinkCache:
5361
links[pkg.name][pkg.version].append(link)
5462

5563
return links
64+
65+
66+
class SimpleRepositoryJsonRootPage(SimpleRepositoryRootPage):
67+
"""
68+
This class represents the parsed content of the JSON version
69+
of a "simple" repository's root page.
70+
This follows the specification laid out in PEP 691.
71+
72+
See: https://peps.python.org/pep-0691/
73+
"""
74+
75+
def __init__(self, content: dict[str, Any]) -> None:
76+
self._content = content
77+
78+
@cached_property
79+
def package_names(self) -> list[str]:
80+
results: list[str] = []
81+
82+
for project in self._content.get("projects", []):
83+
if name := project.get("name"):
84+
results.append(name)
85+
86+
return results

tests/console/commands/test_search.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,10 @@ def test_search_only_legacy_repository(
113113
tester.execute("ipython")
114114

115115
expected = """\
116-
Package Version Source Description
117-
ipython 5.7.0 legacy
118-
ipython 7.5.0 legacy
116+
Package Version Source Description
117+
ipython 4.1.0rc1 legacy
118+
ipython 5.7.0 legacy
119+
ipython 7.5.0 legacy
119120
"""
120121

121122
output = clean_output(tester.io.fetch_output())
@@ -133,11 +134,12 @@ def test_search_multiple_queries(
133134
tester.execute("ipython isort")
134135

135136
expected = """\
136-
Package Version Source Description
137-
ipython 5.7.0 legacy
138-
ipython 7.5.0 legacy
139-
isort 4.3.4 legacy
140-
isort-metadata 4.3.4 legacy
137+
Package Version Source Description
138+
ipython 4.1.0rc1 legacy
139+
ipython 5.7.0 legacy
140+
ipython 7.5.0 legacy
141+
isort 4.3.4 legacy
142+
isort-metadata 4.3.4 legacy
141143
"""
142144

143145
output = clean_output(tester.io.fetch_output())

tests/installation/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def env() -> MockEnv:
2121

2222

2323
@pytest.fixture()
24-
def pool(legacy_repository: LegacyRepository) -> RepositoryPool:
24+
def pool(legacy_repository_html: LegacyRepository) -> RepositoryPool:
2525
pool = RepositoryPool()
2626

2727
pool.add_repository(PyPiRepository(disable_cache=True))

tests/puzzle/test_solver.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,16 @@
5656
)
5757

5858

59+
@pytest.fixture
60+
def legacy_repository(legacy_repository_html: LegacyRepository) -> LegacyRepository:
61+
"""
62+
Override fixture to only test with the html version of the legacy repository
63+
because the json version has the same packages as the PyPI repository and thus
64+
causes different results in the tests that rely on differences.
65+
"""
66+
return legacy_repository_html
67+
68+
5969
def set_package_python_versions(provider: Provider, python_versions: str) -> None:
6070
provider._package.python_versions = python_versions
6171
provider._package_python_constraint = provider._package.python_constraint

0 commit comments

Comments
 (0)