Skip to content

Commit 232c741

Browse files
committed
ci: improve NPM download parameters
1 parent bdac02f commit 232c741

File tree

3 files changed

+57
-51
lines changed

3 files changed

+57
-51
lines changed

dependencies/npm.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/pypi.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/scripts/download_packages.py

Lines changed: 55 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import json
22
import logging
3+
import select
34
from collections.abc import Callable
4-
from dataclasses import dataclass
5+
from dataclasses import dataclass, field
56
from datetime import datetime
67
from pathlib import Path
78
from typing import Any
@@ -19,12 +20,18 @@
1920
)
2021

2122

22-
def parse_npm(data: list[dict[str, Any]]) -> list[str]:
23-
return [x["name"] for x in data]
23+
DEPENDENCIES_DIR = "dependencies"
24+
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
25+
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
26+
TIMEOUT = 90
2427

2528

26-
def parse_pypi(data: dict[str, Any]) -> list[str]:
27-
return [row["project"] for row in data["rows"]]
29+
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
30+
return {x["name"] for x in data}
31+
32+
33+
def parse_pypi(data: dict[str, Any]) -> set[str]:
34+
return {row["project"] for row in data["rows"]}
2835

2936

3037
class ServerError(Exception):
@@ -34,28 +41,25 @@ class ServerError(Exception):
3441
@dataclass(frozen=True)
3542
class Ecosystem:
3643
url: str
37-
params: dict[str, Any] | None
38-
pages: int | None
39-
parser: Callable[[dict[str, Any]], list[str]]
40-
44+
parser: Callable[[Any], set[str]]
45+
params: dict[str, Any] = field(default_factory=dict)
46+
pages: int | None = None
4147

42-
@dataclass(frozen=True)
43-
class PypiEcosystem(Ecosystem):
44-
url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
45-
params = None
46-
pages = None
47-
parser = parse_pypi
4848

49+
pypi_ecosystem = Ecosystem(
50+
url=TOP_PYPI_SOURCE,
51+
parser=parse_pypi,
52+
)
4953

50-
@dataclass(frozen=True)
51-
class NpmEcosystem(Ecosystem):
52-
url = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
53-
params = {"per_page": 1000, "sort": "downloads"}
54-
pages = 15
55-
parser = parse_npm
54+
npm_ecosystem = Ecosystem(
55+
url=TOP_NPM_SOURCE,
56+
parser=parse_npm,
57+
params={"per_page": 100, "sort": "downloads"},
58+
pages=150,
59+
)
5660

5761

58-
ECOSYSTEMS = {"pypi": PypiEcosystem, "npm": NpmEcosystem}
62+
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}
5963

6064

6165
@click.group()
@@ -72,39 +76,41 @@ def entry_point() -> None:
7276
def download(
7377
ecosystem: str,
7478
) -> None:
75-
selected_ecosystem = ECOSYSTEMS[ecosystem]
79+
if ecosystem not in ECOSYSTEMS:
80+
raise click.BadParameter("Not a valid ecosystem")
7681

77-
if pages := selected_ecosystem.pages:
78-
all_packages: list[str] = []
82+
selected_ecosystem = ECOSYSTEMS[ecosystem]
83+
all_packages: set[str] = set()
7984

80-
for page in range(1, pages + 1):
81-
params = selected_ecosystem.params or {}
85+
n_pages = selected_ecosystem.pages or 1
86+
params = selected_ecosystem.params.copy()
87+
for page in range(1, n_pages + 1):
88+
if selected_ecosystem.pages:
8289
params["page"] = page
83-
all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
84-
else:
85-
all_packages = get_packages(selected_ecosystem.url, selected_ecosystem.parser, selected_ecosystem.params)
8690

87-
fpath = Path("dependencies") / f"{ecosystem}.json"
88-
save_data_to_file(all_packages, fpath)
91+
all_packages.update(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))  # update() merges in place; union() returned a new set that was discarded
8992

93+
fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
94+
save_data_to_file(list(all_packages), fpath)
9095

96+
97+
@stamina.retry(
98+
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
99+
attempts=10,
100+
wait_jitter=1,
101+
wait_exp_base=2,
102+
wait_max=8,
103+
)
91104
def get_packages(
92-
base_url: str, parser: Callable[[dict[str, Any]], list[str]], params: dict[str, Any] | None = None
93-
) -> list[str]:
94-
for attempt in stamina.retry_context(
95-
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
96-
attempts=5,
97-
wait_jitter=1,
98-
wait_exp_base=2,
99-
wait_max=8,
100-
):
101-
with attempt, httpx.Client(timeout=30) as client:
102-
response = client.get(str(base_url), params=params)
103-
try:
104-
response.raise_for_status()
105-
except httpx.HTTPStatusError as e:
106-
if e.response.is_server_error:
107-
raise ServerError from e
105+
base_url: str, parser: Callable[[dict[str, Any]], set[str]], params: dict[str, Any] | None = None
106+
) -> set[str]:
107+
with httpx.Client(timeout=TIMEOUT) as client:
108+
response = client.get(str(base_url), params=params)
109+
try:
110+
response.raise_for_status()
111+
except httpx.HTTPStatusError as e:
112+
if e.response.is_server_error:
113+
raise ServerError from e
108114
return parser(response.json())
109115

110116

@@ -113,7 +119,7 @@ def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
113119
with open(str(fpath), "w") as fp:
114120
json.dump(data, fp)
115121

116-
logger.info("Saved %d packages to `%s` file.", len(set(all_packages)), fpath)
122+
logger.info("Saved %d packages to `%s` file.", len(all_packages), fpath)
117123

118124

119125
if __name__ == "__main__":

0 commit comments

Comments
 (0)