2 changes: 1 addition & 1 deletion dependencies/npm.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dependencies/pypi.json

Large diffs are not rendered by default.

87 changes: 46 additions & 41 deletions dependencies/scripts/download_packages.py
@@ -19,6 +19,12 @@
)


DEPENDENCIES_DIR = "dependencies"
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
TIMEOUT = 90


def parse_npm(data: list[dict[str, Any]]) -> list[str]:
return [x["name"] for x in data]

@@ -34,28 +40,25 @@ class ServerError(Exception):
@dataclass(frozen=True)
class Ecosystem:
url: str
params: dict[str, Any] | None
pages: int | None
parser: Callable[[dict[str, Any]], list[str]]

parser: Callable[[Any], list[str]]
params: dict[str, Any] | None = None
pages: int | None = None

@dataclass(frozen=True)
class PypiEcosystem(Ecosystem):
url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
params = None
pages = None
parser = parse_pypi

pypi_ecosystem = Ecosystem(
url=TOP_PYPI_SOURCE,
parser=parse_pypi,
)

@dataclass(frozen=True)
class NpmEcosystem(Ecosystem):
url = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
params = {"per_page": 1000, "sort": "downloads"}
pages = 15
parser = parse_npm
npm_ecosystem = Ecosystem(
url=TOP_NPM_SOURCE,
parser=parse_npm,
params={"per_page": 100, "sort": "downloads"},
pages=150,
)


ECOSYSTEMS = {"pypi": PypiEcosystem, "npm": NpmEcosystem}
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}


@click.group()
@@ -72,39 +75,41 @@ def entry_point() -> None:
def download(
ecosystem: str,
) -> None:
selected_ecosystem = ECOSYSTEMS[ecosystem]
if ecosystem not in ECOSYSTEMS:
raise click.BadParameter("Not a valid ecosystem")

if pages := selected_ecosystem.pages:
all_packages: list[str] = []
selected_ecosystem = ECOSYSTEMS[ecosystem]
all_packages: list[str] = []

for page in range(1, pages + 1):
params = selected_ecosystem.params or {}
n_pages = selected_ecosystem.pages or 1
for page in range(1, n_pages + 1):
params = selected_ecosystem.params or {}
if selected_ecosystem.pages:
params["page"] = page
all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
else:
all_packages = get_packages(selected_ecosystem.url, selected_ecosystem.parser, selected_ecosystem.params)

fpath = Path("dependencies") / f"{ecosystem}.json"
all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, selected_ecosystem.params))

fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
save_data_to_file(all_packages, fpath)


@stamina.retry(
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
attempts=10,
wait_jitter=1,
wait_exp_base=2,
wait_max=8,
)
def get_packages(
base_url: str, parser: Callable[[dict[str, Any]], list[str]], params: dict[str, Any] | None = None
) -> list[str]:
for attempt in stamina.retry_context(
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
attempts=5,
wait_jitter=1,
wait_exp_base=2,
wait_max=8,
):
with attempt, httpx.Client(timeout=30) as client:
response = client.get(str(base_url), params=params)
try:
response.raise_for_status()
except httpx.HTTPStatusError as e:
if e.response.is_server_error:
raise ServerError from e
with httpx.Client(timeout=TIMEOUT) as client:
response = client.get(str(base_url), params=params)
try:
response.raise_for_status()
except httpx.HTTPStatusError as e:
if e.response.is_server_error:
raise ServerError from e
return parser(response.json())


@@ -113,7 +118,7 @@ def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
with open(str(fpath), "w") as fp:
json.dump(data, fp)

logger.info("Saved %d packages to `%s` file.", len(set(all_packages)), fpath)
logger.info("Saved %d packages to `%s` file.", len(all_packages), fpath)
🦉 ✨ Quality

Severity: LOW 🔵

logger.info("Saved %d packages to `%s` file.", len(all_packages), fpath)

The log message now reports the total number of packages fetched, which may include duplicates if the API returns the same packages across different pages. The previous implementation used len(set(all_packages)), which correctly reports the number of unique packages saved. Reporting the unique count is more informative.

With the current bug in pagination, this change actually hides the issue: it reports a large number of packages when in fact only a small set of unique packages is being fetched repeatedly. Once the pagination is fixed, it would still be better to report the unique count.
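As far as the diff shows, the loop builds a params dict containing the page number but then calls get_packages with selected_ecosystem.params, so the page key never reaches the request. A minimal sketch of the loop with that corrected follows; the download_pages helper name is made up for illustration, while Ecosystem and get_packages are the ones defined earlier in this module:

def download_pages(selected_ecosystem: Ecosystem) -> list[str]:
    # Illustrative helper, not part of the PR.
    all_packages: list[str] = []
    n_pages = selected_ecosystem.pages or 1
    for page in range(1, n_pages + 1):
        # Copy so the page number from one iteration does not leak into the shared dict.
        params = dict(selected_ecosystem.params or {})
        if selected_ecosystem.pages:
            params["page"] = page
        # Pass the locally built params, not selected_ecosystem.params.
        all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
    return all_packages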

Suggested change
logger.info("Saved %d packages to `%s` file.", len(all_packages), fpath)
logger.info("Saved %d packages to `%s` file.", len(set(all_packages)), fpath)
More information about this comment
  • File: dependencies/scripts/download_packages.py
  • Line: 121
  • Relative line: 126
  • With suggestion: Yes
  • Suggestion ready for replacement: Yes



if __name__ == "__main__":