11import json
22import logging
3+ import select
34from collections .abc import Callable
4- from dataclasses import dataclass
5+ from dataclasses import dataclass , field
56from datetime import datetime
67from pathlib import Path
78from typing import Any
1920)
2021
2122
22- def parse_npm (data : list [dict [str , Any ]]) -> list [str ]:
23- return [x ["name" ] for x in data ]
# Directory (relative to the working directory) where the package lists are written.
DEPENDENCIES_DIR = "dependencies"
# JSON endpoints the package lists are downloaded from.
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
# Timeout value handed to the HTTP client used for the downloads.
TIMEOUT = 90
2427
2528
26- def parse_pypi (data : dict [str , Any ]) -> list [str ]:
27- return [row ["project" ] for row in data ["rows" ]]
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
    """Collect the unique ``name`` value of every entry in ``data``.

    Raises:
        KeyError: If an entry has no ``"name"`` key.
    """
    names: set[str] = set()
    for entry in data:
        names.add(entry["name"])
    return names
31+
32+
def parse_pypi(data: dict[str, Any]) -> set[str]:
    """Collect the unique ``project`` value of every row under ``data["rows"]``.

    Raises:
        KeyError: If ``data`` has no ``"rows"`` key or a row has no ``"project"`` key.
    """
    projects: set[str] = set()
    for row in data["rows"]:
        projects.add(row["project"])
    return projects
2835
2936
3037class ServerError (Exception ):
@@ -34,28 +41,25 @@ class ServerError(Exception):
3441@dataclass (frozen = True )
3542class Ecosystem :
3643 url : str
37- params : dict [str , Any ] | None
38- pages : int | None
39- parser : Callable [[dict [str , Any ]], list [str ]]
40-
44+ parser : Callable [[Any ], set [str ]]
45+ params : dict [str , Any ] = field (default_factory = dict )
46+ pages : int | None = None
4147
42- @dataclass (frozen = True )
43- class PypiEcosystem (Ecosystem ):
44- url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
45- params = None
46- pages = None
47- parser = parse_pypi
4848
# PyPI: no extra params or page count are supplied, so the Ecosystem defaults apply.
pypi_ecosystem = Ecosystem(url=TOP_PYPI_SOURCE, parser=parse_pypi)
4953
50- @ dataclass ( frozen = True )
51- class NpmEcosystem ( Ecosystem ):
52- url = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
53- params = {"per_page" : 1000 , "sort" : "downloads" }
54- pages = 15
55- parser = parse_npm
# npm: paginated listing sorted by downloads; 150 pages of 100 entries each.
npm_ecosystem = Ecosystem(
    url=TOP_NPM_SOURCE,
    params={"per_page": 100, "sort": "downloads"},
    pages=150,
    parser=parse_npm,
)
5660
5761
58- ECOSYSTEMS = {"pypi" : PypiEcosystem , "npm" : NpmEcosystem }
# Maps the ecosystem name given by the user to its download configuration.
ECOSYSTEMS = {
    "pypi": pypi_ecosystem,
    "npm": npm_ecosystem,
}
5963
6064
6165@click .group ()
@@ -72,39 +76,41 @@ def entry_point() -> None:
def download(
    ecosystem: str,
) -> None:
    """Download the package list for ECOSYSTEM and save it as a JSON file."""
    # Raises click.BadParameter for names that are not keys of ECOSYSTEMS.
    if ecosystem not in ECOSYSTEMS:
        raise click.BadParameter("Not a valid ecosystem")

    selected_ecosystem = ECOSYSTEMS[ecosystem]
    all_packages: set[str] = set()

    # Sources without a page count are fetched with a single request.
    n_pages = selected_ecosystem.pages or 1
    # Work on a copy so the "page" key is never written into the shared
    # Ecosystem.params dict.
    params = selected_ecosystem.params.copy()
    for page in range(1, n_pages + 1):
        if selected_ecosystem.pages:
            params["page"] = page

        # set.union() returns a new set and leaves the receiver untouched;
        # update() merges in place so the fetched names are actually kept.
        all_packages.update(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))

    fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
    # Sort so repeated runs over the same data produce an identical file.
    save_data_to_file(sorted(all_packages), fpath)
9095
96+
@stamina.retry(
    on=(httpx.TransportError, httpx.TimeoutException, ServerError),
    attempts=10,
    wait_jitter=1,
    wait_exp_base=2,
    wait_max=8,
)
def get_packages(
    base_url: str, parser: Callable[[Any], set[str]], params: dict[str, Any] | None = None
) -> set[str]:
    """Fetch ``base_url`` and return the package names extracted by ``parser``.

    Transport errors, timeouts and 5xx responses are retried by the
    ``stamina.retry`` decorator; any other HTTP error status is raised
    immediately.

    Args:
        base_url: URL to request.
        parser: Callable that converts the decoded JSON body into a set of names.
        params: Optional query parameters for the request.

    Returns:
        The set of package names produced by ``parser``.

    Raises:
        ServerError: If the response has a 5xx status (after retries are exhausted).
        httpx.HTTPStatusError: If the response has any other error status.
    """
    with httpx.Client(timeout=TIMEOUT) as client:
        response = client.get(str(base_url), params=params)
        try:
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.is_server_error:
                # Translate 5xx into the retryable error type.
                raise ServerError from e
            # Client-side errors are not retryable; surface them instead of
            # handing an error body to the parser.
            raise
        return parser(response.json())
109115
110116
@@ -113,7 +119,7 @@ def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
113119 with open (str (fpath ), "w" ) as fp :
114120 json .dump (data , fp )
115121
116- logger .info ("Saved %d packages to `%s` file." , len (set ( all_packages ) ), fpath )
122+ logger .info ("Saved %d packages to `%s` file." , len (all_packages ), fpath )
117123
118124
119125if __name__ == "__main__" :
0 commit comments