Skip to content

Commit 7413dfd

Browse files
authored
ci: improve NPM download parameters (#380)
The API from which we retrieve the top NPM packages was sometimes returning 5xx errors, so I've lowered the number of items per page and increased the number of retries and the timeout.
1 parent bdac02f commit 7413dfd

File tree

3 files changed

+56
-51
lines changed

3 files changed

+56
-51
lines changed

dependencies/npm.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/pypi.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/scripts/download_packages.py

Lines changed: 54 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22
import logging
33
from collections.abc import Callable
4-
from dataclasses import dataclass
4+
from dataclasses import dataclass, field
55
from datetime import datetime
66
from pathlib import Path
77
from typing import Any
@@ -19,12 +19,18 @@
1919
)
2020

2121

22-
def parse_npm(data: list[dict[str, Any]]) -> list[str]:
23-
return [x["name"] for x in data]
22+
DEPENDENCIES_DIR = "dependencies"
23+
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
24+
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
25+
TIMEOUT = 90
2426

2527

26-
def parse_pypi(data: dict[str, Any]) -> list[str]:
27-
return [row["project"] for row in data["rows"]]
28+
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
29+
return {x["name"] for x in data}
30+
31+
32+
def parse_pypi(data: dict[str, Any]) -> set[str]:
33+
return {row["project"] for row in data["rows"]}
2834

2935

3036
class ServerError(Exception):
@@ -34,28 +40,25 @@ class ServerError(Exception):
3440
@dataclass(frozen=True)
3541
class Ecosystem:
3642
url: str
37-
params: dict[str, Any] | None
38-
pages: int | None
39-
parser: Callable[[dict[str, Any]], list[str]]
40-
43+
parser: Callable[[Any], set[str]]
44+
params: dict[str, Any] = field(default_factory=dict)
45+
pages: int | None = None
4146

42-
@dataclass(frozen=True)
43-
class PypiEcosystem(Ecosystem):
44-
url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
45-
params = None
46-
pages = None
47-
parser = parse_pypi
4847

48+
pypi_ecosystem = Ecosystem(
49+
url=TOP_PYPI_SOURCE,
50+
parser=parse_pypi,
51+
)
4952

50-
@dataclass(frozen=True)
51-
class NpmEcosystem(Ecosystem):
52-
url = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
53-
params = {"per_page": 1000, "sort": "downloads"}
54-
pages = 15
55-
parser = parse_npm
53+
npm_ecosystem = Ecosystem(
54+
url=TOP_NPM_SOURCE,
55+
parser=parse_npm,
56+
params={"per_page": 100, "sort": "downloads"},
57+
pages=150,
58+
)
5659

5760

58-
ECOSYSTEMS = {"pypi": PypiEcosystem, "npm": NpmEcosystem}
61+
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}
5962

6063

6164
@click.group()
@@ -72,39 +75,41 @@ def entry_point() -> None:
7275
def download(
7376
ecosystem: str,
7477
) -> None:
75-
selected_ecosystem = ECOSYSTEMS[ecosystem]
78+
if ecosystem not in ECOSYSTEMS:
79+
raise click.BadParameter("Not a valid ecosystem")
7680

77-
if pages := selected_ecosystem.pages:
78-
all_packages: list[str] = []
81+
selected_ecosystem = ECOSYSTEMS[ecosystem]
82+
all_packages: set[str] = set()
7983

80-
for page in range(1, pages + 1):
81-
params = selected_ecosystem.params or {}
84+
n_pages = selected_ecosystem.pages or 1
85+
params = selected_ecosystem.params.copy()
86+
for page in range(1, n_pages + 1):
87+
if selected_ecosystem.pages:
8288
params["page"] = page
83-
all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
84-
else:
85-
all_packages = get_packages(selected_ecosystem.url, selected_ecosystem.parser, selected_ecosystem.params)
8689

87-
fpath = Path("dependencies") / f"{ecosystem}.json"
88-
save_data_to_file(all_packages, fpath)
90+
all_packages.update(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
8991

92+
fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
93+
save_data_to_file(list(all_packages), fpath)
9094

95+
96+
@stamina.retry(
97+
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
98+
attempts=10,
99+
wait_jitter=1,
100+
wait_exp_base=2,
101+
wait_max=8,
102+
)
91103
def get_packages(
92-
base_url: str, parser: Callable[[dict[str, Any]], list[str]], params: dict[str, Any] | None = None
93-
) -> list[str]:
94-
for attempt in stamina.retry_context(
95-
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
96-
attempts=5,
97-
wait_jitter=1,
98-
wait_exp_base=2,
99-
wait_max=8,
100-
):
101-
with attempt, httpx.Client(timeout=30) as client:
102-
response = client.get(str(base_url), params=params)
103-
try:
104-
response.raise_for_status()
105-
except httpx.HTTPStatusError as e:
106-
if e.response.is_server_error:
107-
raise ServerError from e
104+
base_url: str, parser: Callable[[dict[str, Any]], set[str]], params: dict[str, Any] | None = None
105+
) -> set[str]:
106+
with httpx.Client(timeout=TIMEOUT) as client:
107+
response = client.get(str(base_url), params=params)
108+
try:
109+
response.raise_for_status()
110+
except httpx.HTTPStatusError as e:
111+
if e.response.is_server_error:
112+
raise ServerError from e
108113
return parser(response.json())
109114

110115

@@ -113,7 +118,7 @@ def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
113118
with open(str(fpath), "w") as fp:
114119
json.dump(data, fp)
115120

116-
logger.info("Saved %d packages to `%s` file.", len(set(all_packages)), fpath)
121+
logger.info("Saved %d packages to `%s` file.", len(all_packages), fpath)
117122

118123

119124
if __name__ == "__main__":

0 commit comments

Comments
 (0)