Skip to content

Commit b5bb6d6

Browse files
authored
refactor: paginate npm (#342)
1 parent 10772eb commit b5bb6d6

File tree

3 files changed

+67
-26
lines changed

3 files changed

+67
-26
lines changed

dependencies/npm.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/pypi.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dependencies/scripts/download_packages.py

Lines changed: 65 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
import json
22
import logging
3+
from dataclasses import dataclass
34
from datetime import datetime
45
from pathlib import Path
5-
from typing import Any
6+
from typing import Any, Callable, Optional
67
from zoneinfo import ZoneInfo
78

89
import click
@@ -25,16 +26,35 @@ def parse_pypi(data: dict[str, Any]) -> list[str]:
2526
return [row["project"] for row in data["rows"]]
2627

2728

28-
ECOSYSTEMS = {
29-
"npm": {
30-
"url": "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages?per_page=10000&page=1&sort=downloads",
31-
"parser": parse_npm,
32-
},
33-
"pypi": {
34-
"url": "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json",
35-
"parser": parse_pypi,
36-
},
37-
}
29+
class ServerError(Exception):
    """Custom exception for HTTP 5xx errors.

    ``get_packages`` raises this when a response reports a server error and
    lists it in its retry policy, so raising it triggers another attempt.
    """
31+
32+
33+
@dataclass(frozen=True)
class Ecosystem:
    """Describe where and how the package list of one ecosystem is fetched.

    The subclasses in this module override these attributes at class level and
    are looked up through ``ECOSYSTEMS`` without being instantiated.
    """

    # Endpoint the package list is requested from.
    url: str
    # Query-string parameters sent with each request, or None for none.
    params: Optional[dict[str, Any]]
    # Number of pages to request (1..pages); None (or 0) means one plain request.
    pages: Optional[int]
    # Converts the decoded JSON response body into a list of package entries.
    parser: Callable[[dict[str, Any]], list[str]]
39+
40+
41+
@dataclass(frozen=True)
class PypiEcosystem(Ecosystem):
    """PyPI configuration: a single JSON document fetched with one request."""

    # NOTE: these overrides carry no annotations, so they are plain class
    # attributes rather than dataclass field defaults. The class itself (not an
    # instance) is what ``ECOSYSTEMS`` stores and ``download`` reads from.
    url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
    params = None  # no query-string parameters
    pages = None  # not paginated: ``download`` issues a single request
    parser = parse_pypi
47+
48+
49+
@dataclass(frozen=True)
class NpmEcosystem(Ecosystem):
    """npm configuration: a paginated listing sorted by downloads."""

    # NOTE: these overrides carry no annotations, so they are plain class
    # attributes rather than dataclass field defaults. The class itself (not an
    # instance) is what ``ECOSYSTEMS`` stores and ``download`` reads from.
    url = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
    # Shared, class-level dict: ``frozen=True`` does not protect its contents,
    # so code that needs extra keys (e.g. "page") should work on a copy.
    params = {"per_page": 1000, "sort": "downloads"}
    pages = 15  # 15 pages x 1000 per page -> up to 15,000 entries requested
    parser = parse_npm
55+
56+
57+
# Maps the ecosystem name passed to ``download`` to its configuration.
# The values are the classes themselves, not instances.
ECOSYSTEMS = {"pypi": PypiEcosystem, "npm": NpmEcosystem}
3858

3959

4060
@click.group()
@@ -48,30 +68,51 @@ def entry_point() -> None:
4868
type=str,
4969
required=True,
5070
)
51-
def download(ecosystem: str) -> None:
71+
def download(
    ecosystem: str,
) -> None:
    """Download the package list of ``ecosystem`` and save it as JSON.

    When the selected ecosystem defines ``pages``, pages ``1..pages`` are
    requested one after another and their results concatenated in order;
    otherwise a single request is made. The result is written to
    ``dependencies/<ecosystem>.json``.

    Args:
        ecosystem: Key into ``ECOSYSTEMS`` selecting the source to download.

    Raises:
        KeyError: If ``ecosystem`` is not a key of ``ECOSYSTEMS``.
    """
    selected_ecosystem = ECOSYSTEMS[ecosystem]

    if pages := selected_ecosystem.pages:
        base_params = selected_ecosystem.params or {}
        all_packages: list[str] = []

        for page in range(1, pages + 1):
            # Build a fresh dict for every request. The configured params are a
            # shared class-level dict, and writing "page" into it directly would
            # permanently alter the ecosystem's configuration.
            page_params = {**base_params, "page": page}
            all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, page_params))
    else:
        all_packages = get_packages(selected_ecosystem.url, selected_ecosystem.parser, selected_ecosystem.params)

    fpath = Path("dependencies") / f"{ecosystem}.json"
    save_data_to_file(all_packages, fpath)
88+
89+
90+
def get_packages(
    base_url: str, parser: Callable[[dict[str, Any]], list[str]], params: Optional[dict[str, Any]] = None
) -> list[str]:
    """Fetch ``base_url`` and return the package list extracted by ``parser``.

    Transport errors, timeouts and HTTP 5xx responses are retried (up to five
    attempts with exponential back-off). Any other HTTP error status is
    propagated immediately.

    Args:
        base_url: URL to request.
        parser: Callable that turns the decoded JSON body into a list of strings.
        params: Optional query-string parameters sent with the request.

    Returns:
        The list produced by ``parser`` for the successful response.

    Raises:
        ServerError: If the server keeps answering with a 5xx status.
        httpx.HTTPStatusError: If the response has a non-5xx error status.
        httpx.TransportError: If the request keeps failing at transport level.
    """
    for attempt in stamina.retry_context(
        on=(httpx.TransportError, httpx.TimeoutException, ServerError),
        attempts=5,
        wait_jitter=1,
        wait_exp_base=2,
        wait_max=8,
    ):
        with attempt, httpx.Client(timeout=30) as client:
            response = client.get(str(base_url), params=params)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                if e.response.is_server_error:
                    # Converted to ServerError so the retry policy picks it up.
                    raise ServerError(f"HTTP {e.response.status_code} from {e.request.url}") from e
                # Client errors are not retried, but must not be swallowed
                # either: parsing an error body would fail or yield bad data.
                raise
            return parser(response.json())
66108

67-
fpath = Path("dependencies") / f"{ecosystem}.json"
68109

69-
packages = ECOSYSTEMS[ecosystem]["parser"](response.json()) # type: ignore[operator]
70-
data = {"date": datetime.now(ZoneInfo("UTC")).isoformat(), "packages": packages}
110+
def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
    """Write the package list to ``fpath`` as JSON with a UTC timestamp.

    Duplicate entries are dropped before writing (the first occurrence is
    kept), so the count in the log message matches the saved file.

    Args:
        all_packages: Package entries in the order they were downloaded.
        fpath: Destination file; it is overwritten if it already exists.
    """
    # dict.fromkeys preserves insertion order, so the original ordering of the
    # list survives de-duplication.
    unique_packages = list(dict.fromkeys(all_packages))
    data = {"date": datetime.now(ZoneInfo("UTC")).isoformat(), "packages": unique_packages}
    with open(fpath, "w", encoding="utf-8") as fp:
        json.dump(data, fp)

    logger.info("Saved %d packages to `%s` file.", len(unique_packages), fpath)
75116

76117

77118
if __name__ == "__main__":

0 commit comments

Comments
 (0)