1919)
2020
2121
class ServerError(Exception):
    """Signal that an HTTP response came back with a 5xx status code."""
24+
25+
class InvalidJSONError(Exception):
    """Signal that received JSON does not have the expected format."""
28+
29+
# Output location
DEPENDENCIES_DIR = "dependencies"
"""Name of the directory that receives the generated dependency files."""

# Upstream data sources
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
"""Endpoint serving the top PyPI packages data."""

TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
"""ecosyste.ms endpoint serving the top npm packages data."""

# Retry policy
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
"""Exception types for which a failed request is retried."""

RETRY_ATTEMPTS = 15
"""Upper bound on the number of attempts per request."""

RETRY_WAIT_JITTER = 1
"""Random jitter applied to the wait between retries."""

RETRY_WAIT_EXP_BASE = 2
"""Base of the exponential backoff used for retry waits."""

RETRY_WAIT_MAX = 8
"""Longest wait between two retry attempts, in seconds."""

# HTTP client
TIMEOUT = 90
"""Timeout applied to HTTP requests, in seconds."""
2659
2760
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
    """Extract the package names from a page of npm package data.

    Args:
        data: Decoded JSON payload; expected to be a list of objects that
            each carry a ``"name"`` key.

    Returns:
        The set of ``"name"`` values found in ``data``.

    Raises:
        InvalidJSONError: If an entry lacks the ``"name"`` key, or the payload
            is not a list of objects at all.
    """
    try:
        return {entry["name"] for entry in data}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads that are valid JSON but the wrong shape
        # (e.g. a list of strings, or null); without it they would escape as
        # an unrelated error instead of the module's format exception.
        raise InvalidJSONError("npm payload is not a list of objects with a 'name' key") from e
3067
3168
def parse_pypi(data: dict[str, Any]) -> set[str]:
    """Extract the package names from the top PyPI packages data.

    Args:
        data: Decoded JSON payload; expected to be an object whose ``"rows"``
            entry is a list of objects that each carry a ``"project"`` key.

    Returns:
        The set of ``"project"`` values found under ``data["rows"]``.

    Raises:
        InvalidJSONError: If ``"rows"`` or ``"project"`` is missing, or the
            payload does not have the expected object/list structure.
    """
    try:
        return {row["project"] for row in data["rows"]}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads that are valid JSON but the wrong shape
        # (e.g. a top-level list, or rows that are not objects).
        raise InvalidJSONError("PyPI payload does not contain 'rows' with 'project' entries") from e
3875
3976
4077@dataclass (frozen = True )
4178class Ecosystem :
79+ """Configuration for a package ecosystem (PyPI, npm, etc.)."""
80+
4281 url : str
4382 parser : Callable [[Any ], set [str ]]
4483 params : dict [str , Any ] = field (default_factory = dict )
@@ -49,20 +88,51 @@ class Ecosystem:
4988 url = TOP_PYPI_SOURCE ,
5089 parser = parse_pypi ,
5190)
91+ """Ecosystem configuration for PyPI packages."""
5292
npm_ecosystem = Ecosystem(
    url=TOP_NPM_SOURCE,
    parser=parse_npm,
    params=dict(per_page=100, sort="downloads"),
    pages=150,
)
"""npm ecosystem configuration; fetched page by page."""


ECOSYSTEMS = {
    "pypi": pypi_ecosystem,
    "npm": npm_ecosystem,
}
"""Lookup table from ecosystem name to its configuration."""
104+
105+
106+ def get_params (params : dict [str , Any ] | None , page : int | None ) -> dict [str , Any ]:
107+ """Combine base parameters with page parameter if provided."""
108+ new_params : dict [str , Any ] = {}
109+ if params :
110+ new_params |= params
111+
112+ if page :
113+ new_params ["page" ] = page
114+
115+ return new_params
116+
117+
def _run(ecosystem: str) -> None:
    """Download every page of packages for an ecosystem and save them to a file.

    Args:
        ecosystem: Key into ``ECOSYSTEMS`` selecting the configuration to use.

    The collected names are written to ``<DEPENDENCIES_DIR>/<ecosystem>.json``.
    """
    selected_ecosystem = ECOSYSTEMS[ecosystem]
    all_packages: set[str] = set()

    # Ecosystems without pagination are fetched as one request with no
    # "page" parameter.
    paginated = bool(selected_ecosystem.pages)
    n_pages = selected_ecosystem.pages or 1

    # A single client is shared by all page requests.
    with httpx.Client(timeout=TIMEOUT) as client:
        for page in range(1, n_pages + 1):
            params = get_params(selected_ecosystem.params, page if paginated else None)
            all_packages.update(
                get_packages(client, selected_ecosystem.url, selected_ecosystem.parser, params)
            )

    fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
    # Sort before saving: set iteration order for strings varies between
    # interpreter runs, which would otherwise reshuffle the file every time.
    save_data_to_file(sorted(all_packages), fpath)
62131
63132
@click.group()
def entry_point() -> None:
    """Top-level click command group for the CLI application."""
67137
68138
@@ -75,45 +145,44 @@ def entry_point() -> None:
def download(
    ecosystem: str,
) -> None:
    """Validate the ecosystem name, then download its packages.

    Raises:
        click.BadParameter: If ``ecosystem`` is not a key of ``ECOSYSTEMS``.
    """
    if ecosystem in ECOSYSTEMS:
        return _run(ecosystem)

    raise click.BadParameter("Not a valid ecosystem")
94153
95154
def get_packages(
    client: httpx.Client,
    base_url: str,
    parser: Callable[[dict[str, Any]], set[str]],
    params: dict[str, Any] | None = None,
) -> set[str]:
    """Fetch a URL, decode its JSON body and parse it into package names.

    The request is retried for the exception types in ``RETRY_ON``, using the
    ``RETRY_*`` constants for the attempt limit and backoff.

    Args:
        client: HTTP client used to issue the request.
        base_url: URL to fetch.
        parser: Callable turning the decoded JSON into a set of package names.
        params: Optional query parameters for the request.

    Returns:
        The set of package names produced by ``parser``.

    Raises:
        ServerError: If the response has a server-error status and the retry
            attempts are exhausted.
        httpx.HTTPStatusError: If ``raise_for_status`` rejects the response
            for any other status.
        InvalidJSONError: If the response body cannot be decoded as JSON.
    """
    for attempt in stamina.retry_context(
        on=RETRY_ON,
        attempts=RETRY_ATTEMPTS,
        wait_jitter=RETRY_WAIT_JITTER,
        wait_exp_base=RETRY_WAIT_EXP_BASE,
        wait_max=RETRY_WAIT_MAX,
    ):
        with attempt:
            response = client.get(str(base_url), params=params)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                if e.response.is_server_error:
                    # Translated so the retry policy (RETRY_ON) picks it up.
                    raise ServerError from e
                # Any other HTTP error is re-raised as is; swallowing it would
                # feed an error body to the parser as if it were package data.
                raise

            try:
                json_data = response.json()
            except json.JSONDecodeError as e:
                raise InvalidJSONError from e

            return parser(json_data)
114182
115183
def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
    """Write the package list to a JSON file together with a UTC timestamp.

    The file holds an object with two keys: ``"date"`` (ISO-8601 timestamp of
    the moment of writing, in UTC) and ``"packages"`` (the given list).

    Args:
        all_packages: Package names to store.
        fpath: Destination file; missing parent directories are created.
    """
    data = {"date": datetime.now(ZoneInfo("UTC")).isoformat(), "packages": all_packages}

    # Create the target directory up front so a missing directory does not
    # fail the write after all the data has been gathered.
    fpath.parent.mkdir(parents=True, exist_ok=True)
    with fpath.open("w", encoding="utf-8") as fp:
        json.dump(data, fp)
0 commit comments