11import json
22import logging
3+ from abc import ABC , abstractmethod
34from collections .abc import Callable
45from dataclasses import dataclass
56from datetime import datetime
1011import click
1112import httpx
1213import stamina
from pydantic import BaseModel, Field
15+ from typing_extensions import Self , override
1316
1417logger = logging .getLogger ("weekly_download" )
1518logging .basicConfig (
1922)
2023
2124
# Directory that receives the per-ecosystem JSON output files.
DEPENDENCIES_DIR = "dependencies"
# URL used as the package source for the PyPI ecosystem.
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
# URL used as the package source for both npm ecosystems.
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
28+
29+
class BaseDataInterface(BaseModel, ABC):
    """Abstract pydantic model for the payload written to an output file.

    Subclasses decide how a flat list of package names is arranged by
    implementing ``from_packages_list``.
    """

    # Package names held by this payload.
    packages: list[str]
    # ISO-8601 UTC timestamp. A default factory is used so the value is taken
    # when an instance is created, not once when the module is imported.
    date: str = Field(default_factory=lambda: datetime.now(ZoneInfo("UTC")).isoformat())

    @classmethod
    @abstractmethod
    def from_packages_list(cls, packages: list[str]) -> Self:
        """Build an instance from a flat list of package names."""
37+
38+
class SimpleDataInterface(BaseDataInterface):
    """Payload that stores the package names as one flat list."""

    @override
    @classmethod
    def from_packages_list(cls, packages: list[str]) -> Self:
        """Return an instance whose ``packages`` field holds the given names."""
        return cls(packages=packages)
44+
45+
class NpmFormattedDataInterface(BaseDataInterface):
    """Payload that groups scoped npm packages by their namespace.

    Names of the form ``@namespace/name`` are filed under ``namespaces``; every
    other name stays in the inherited ``packages`` list.
    """

    # Maps each namespace (leading "@" included) to the package names under it.
    namespaces: dict[str, list[str]]

    @override
    @classmethod
    def from_packages_list(cls, packages: list[str]) -> Self:
        """Split ``packages`` into namespaced and non-namespaced entries.

        Args:
            packages: Package names, optionally of the form ``@namespace/name``.

        Returns:
            An instance with scoped names grouped under ``namespaces`` and the
            remaining names in ``packages``, both in input order.
        """
        namespaces: dict[str, list[str]] = {}
        non_namespace_packages: list[str] = []

        for package in packages:
            # Split on the first "/" only, so an unexpected extra slash cannot
            # raise during unpacking and abort the whole run.
            namespace, separator, package_name = package.partition("/")
            if package.startswith("@") and separator:
                namespaces.setdefault(namespace, []).append(package_name)
            else:
                # Unscoped names, and "@..." names without a "/", are kept whole.
                non_namespace_packages.append(package)
        return cls(packages=non_namespace_packages, namespaces=namespaces)
64+
65+
def parse_npm(data: list[dict[str, Any]]) -> list[str]:
    """Return the ``"name"`` value of every record in ``data``, in order."""
    return [record["name"] for record in data]
2468
@@ -36,26 +80,39 @@ class Ecosystem:
3680 url : str
3781 params : dict [str , Any ] | None
3882 pages : int | None
39- parser : Callable [[dict [str , Any ]], list [str ]]
83+ parser : Callable [[Any ], list [str ]]
84+ data_interface : type [BaseDataInterface ]
4085
4186
42- @ dataclass ( frozen = True )
43- class PypiEcosystem ( Ecosystem ):
44- url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
45- params = None
46- pages = None
47- parser = parse_pypi
48-
# PyPI: no query parameters and no paging configured.
pypi_ecosystem = Ecosystem(
    data_interface=SimpleDataInterface,
    parser=parse_pypi,
    url=TOP_PYPI_SOURCE,
    params=None,
    pages=None,
)

# npm: 150 pages requested with per_page=100 and sort=downloads, stored flat.
npm_ecosystem = Ecosystem(
    data_interface=SimpleDataInterface,
    parser=parse_npm,
    url=TOP_NPM_SOURCE,
    params=dict(per_page=100, sort="downloads"),
    pages=150,
)

# Same npm source and paging, but stored grouped by namespace.
npm_formatted_ecosystem = Ecosystem(
    data_interface=NpmFormattedDataInterface,
    parser=parse_npm,
    url=TOP_NPM_SOURCE,
    params=dict(per_page=100, sort="downloads"),
    pages=150,
)

# Registry of the ecosystem names accepted by `download`.
ECOSYSTEMS: dict[str, Ecosystem] = dict(
    pypi=pypi_ecosystem,
    npm=npm_ecosystem,
    npm_formatted=npm_formatted_ecosystem,
)
59116
60117
61118@click .group ()
@@ -72,33 +129,38 @@ def entry_point() -> None:
def download(
    ecosystem: str,
) -> None:
    # Fetch every configured page for `ecosystem`, shape the collected names
    # with the ecosystem's data interface, and write them to
    # `<DEPENDENCIES_DIR>/<ecosystem>.json`.
    # Raises click.BadParameter when `ecosystem` is not a key of ECOSYSTEMS.
    # NOTE(review): documented with comments rather than a docstring because
    # this looks like a click command, where a docstring would become help
    # text; confirm against the decorators above.
    if ecosystem not in ECOSYSTEMS:
        raise click.BadParameter("Not a valid ecosystem")

    selected_ecosystem = ECOSYSTEMS[ecosystem]
    all_packages: list[str] = []

    # An ecosystem without paging is fetched with exactly one request.
    n_pages = selected_ecosystem.pages or 1
    for page in range(1, n_pages + 1):
        # Copy the configured params: writing "page" into the original dict
        # would mutate state shared through the module-level Ecosystem object.
        params = dict(selected_ecosystem.params or {})
        if selected_ecosystem.pages:
            params["page"] = page

        all_packages.extend(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))

    fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
    data = selected_ecosystem.data_interface.from_packages_list(all_packages)
    save_data_to_file(data, fpath)
90150
91151def get_packages (
92- base_url : str , parser : Callable [[dict [str , Any ]], list [str ]], params : dict [str , Any ] | None = None
152+ base_url : str ,
153+ parser : Callable [[dict [str , Any ]], list [str ]],
154+ params : dict [str , Any ] | None = None ,
93155) -> list [str ]:
94156 for attempt in stamina .retry_context (
95157 on = (httpx .TransportError , httpx .TimeoutException , ServerError ),
96- attempts = 5 ,
158+ attempts = 10 ,
97159 wait_jitter = 1 ,
98160 wait_exp_base = 2 ,
99161 wait_max = 8 ,
100162 ):
101- with attempt , httpx .Client (timeout = 30 ) as client :
163+ with attempt , httpx .Client (timeout = 90 ) as client :
102164 response = client .get (str (base_url ), params = params )
103165 try :
104166 response .raise_for_status ()
@@ -108,12 +170,14 @@ def get_packages(
108170 return parser (response .json ())
109171
110172
def save_data_to_file(
    data: BaseDataInterface,
    fpath: Path,
) -> None:
    """Serialize ``data`` as JSON into ``fpath`` and log the destination.

    Args:
        data: Model whose ``model_dump()`` output is written.
        fpath: Destination file, opened in write mode. Missing parent
            directories are created.
    """
    # Create the output directory up front so already-downloaded data is not
    # lost to a missing folder.
    fpath.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform locale.
    with fpath.open("w", encoding="utf-8") as fp:
        json.dump(data.model_dump(), fp)

    logger.info("Saved packages to `%s` file.", fpath)
117181
118182
119183if __name__ == "__main__" :
0 commit comments