Skip to content

Commit d7ac402

Browse files
authored
ci: Improve weekly download flow (#381)
This PR increases the number of retries performed while downloading packages, changes the CI job to use `continue-on-error`, and adds some tests, since the functionality is growing.
1 parent 867e837 commit d7ac402

File tree

7 files changed

+409
-67
lines changed

7 files changed

+409
-67
lines changed

.github/workflows/test.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,18 @@ jobs:
2727

2828
- name: Run tests
2929
run: uv run pytest tests
30+
31+
test-weekly-download:
32+
runs-on: ubuntu-latest
33+
if: "!startsWith(github.event.head_commit.message, 'bump:')"
34+
steps:
35+
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
36+
37+
- name: Install uv
38+
uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
39+
40+
- name: Install the dependencies
41+
run: uv sync --locked --group download --group dev --python 3.14
42+
43+
- name: Run tests
44+
run: uv run pytest dependencies/tests

.github/workflows/weekly_download.yml

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,46 @@ name: Weekly download
22

33
on:
44
schedule:
5-
- cron: "0 0 * * 1" # every Monday at 00:00 UTC
5+
- cron: "0 0 * * 1" # every Monday at 00:00 UTC
66
workflow_dispatch:
77

8-
98
jobs:
109
download:
1110
runs-on: ubuntu-latest
1211
steps:
13-
- uses: actions/create-github-app-token@67018539274d69449ef7c02e8e71183d1719ab42 # v2.1.4
14-
id: app-token
15-
with:
16-
app-id: ${{ vars.ELEMENTSINTERACTIVE_BOT_APP_ID }}
17-
private-key: ${{ secrets.ELEMENTSINTERACTIVE_BOT_PRIVATE_KEY }}
18-
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
19-
with:
20-
fetch-depth: 0
21-
token: ${{ steps.app-token.outputs.token }}
22-
ref: ${{ github.head_ref }}
23-
- name: Install uv
24-
uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
12+
- uses: actions/create-github-app-token@67018539274d69449ef7c02e8e71183d1719ab42 # v2.1.4
13+
id: app-token
14+
with:
15+
app-id: ${{ vars.ELEMENTSINTERACTIVE_BOT_APP_ID }}
16+
private-key: ${{ secrets.ELEMENTSINTERACTIVE_BOT_PRIVATE_KEY }}
17+
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
18+
with:
19+
fetch-depth: 0
20+
token: ${{ steps.app-token.outputs.token }}
21+
ref: ${{ github.head_ref }}
22+
- name: Install uv
23+
uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
24+
25+
- name: Install the project
26+
run: uv sync --locked --only-group download --python 3.14
27+
28+
- name: Download Pypi packages
29+
continue-on-error: true
30+
run: |
31+
uv run --no-project dependencies/scripts/download_packages.py download pypi
32+
33+
- name: Download NPM packages
34+
continue-on-error: true
35+
run: |
36+
uv run --no-project dependencies/scripts/download_packages.py download npm
2537
26-
- name: Install the project
27-
run: uv sync --locked --only-group download
38+
- name: Configure git
39+
run: |
40+
git config user.name "github-actions[bot]"
41+
git config user.email "github-actions[bot]@users.noreply.github.com"
2842
29-
- name: Download packages from trusted sources
30-
run: |
31-
uv run --no-project dependencies/scripts/download_packages.py download pypi || echo 'Failed to download trusted pypi packages'
32-
uv run --no-project dependencies/scripts/download_packages.py download npm || echo 'Failed to download trusted npm packages'
33-
34-
- name: Configure git
35-
run: |
36-
git config user.name "github-actions[bot]"
37-
git config user.email "github-actions[bot]@users.noreply.github.com"
38-
39-
- name: Push changes to repo
40-
run: |
41-
git add .
42-
git commit -m "chore: Weekly update of trusted packages"
43-
git push origin HEAD:main
43+
- name: Push changes to repo
44+
run: |
45+
git add .
46+
git commit -m "chore: Weekly update of trusted packages"
47+
git push origin HEAD:main

dependencies/scripts/__init__.py

Whitespace-only changes.

dependencies/scripts/download_packages.py

Lines changed: 104 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,65 @@
1919
)
2020

2121

22+
class ServerError(Exception):
23+
"""Custom exception for HTTP 5xx errors."""
24+
25+
26+
class InvalidJSONError(Exception):
27+
"""Custom exception for when the received JSON does not match the expected format."""
28+
29+
30+
# Directory name
2231
DEPENDENCIES_DIR = "dependencies"
32+
"""Directory name where dependency files will be saved."""
33+
34+
# Sources
2335
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
36+
"""URL for fetching top PyPI packages data."""
37+
2438
TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
39+
"""URL for fetching top npm packages data from ecosyste.ms."""
40+
41+
# Retry constants
42+
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
43+
"""Tuple of exceptions that should trigger retry attempts."""
44+
45+
RETRY_ATTEMPTS = 15
46+
"""Maximum number of retry attempts for failed requests."""
47+
48+
RETRY_WAIT_JITTER = 1
49+
"""Random jitter factor for retry wait times."""
50+
51+
RETRY_WAIT_EXP_BASE = 2
52+
"""Exponential backoff base multiplier for retry wait times."""
53+
54+
RETRY_WAIT_MAX = 8
55+
"""Maximum wait time between retry attempts in seconds."""
56+
2557
TIMEOUT = 90
58+
"""HTTP request timeout in seconds."""
2659

2760

2861
def parse_npm(data: list[dict[str, Any]]) -> set[str]:
    """Parse npm package data and extract package names.

    Args:
        data: Decoded JSON payload, expected to be a list of objects that
            each carry a ``"name"`` key.

    Returns:
        The set of package names found in ``data`` (duplicates collapse).

    Raises:
        InvalidJSONError: If the payload does not have the expected shape,
            i.e. an entry lacks ``"name"`` or the payload/entries cannot be
            iterated or indexed that way at all.
    """
    try:
        return {entry["name"] for entry in data}
    except (KeyError, TypeError) as e:
        # TypeError covers well-formed JSON of the wrong shape (for example
        # an object or ``null`` instead of a list of objects), which would
        # otherwise escape as an unrelated low-level error.
        raise InvalidJSONError from e
3067

3168

3269
def parse_pypi(data: dict[str, Any]) -> set[str]:
    """Parse PyPI package data and extract package names.

    Args:
        data: Decoded JSON payload, expected to be an object whose ``"rows"``
            entry is a list of objects that each carry a ``"project"`` key.

    Returns:
        The set of project names found under ``data["rows"]``.

    Raises:
        InvalidJSONError: If the payload does not have the expected shape,
            i.e. ``"rows"`` or ``"project"`` is missing, or the payload/rows
            cannot be indexed or iterated that way at all.
    """
    try:
        return {row["project"] for row in data["rows"]}
    except (KeyError, TypeError) as e:
        # TypeError covers well-formed JSON of the wrong shape (for example
        # a top-level list, or ``"rows"`` being ``null``), which would
        # otherwise escape as an unrelated low-level error.
        raise InvalidJSONError from e
3875

3976

4077
@dataclass(frozen=True)
4178
class Ecosystem:
79+
"""Configuration for a package ecosystem (PyPI, npm, etc.)."""
80+
4281
url: str
4382
parser: Callable[[Any], set[str]]
4483
params: dict[str, Any] = field(default_factory=dict)
@@ -49,20 +88,51 @@ class Ecosystem:
4988
url=TOP_PYPI_SOURCE,
5089
parser=parse_pypi,
5190
)
91+
"""Ecosystem configuration for PyPI packages."""
5292

5393
npm_ecosystem = Ecosystem(
5494
url=TOP_NPM_SOURCE,
5595
parser=parse_npm,
5696
params={"per_page": 100, "sort": "downloads"},
5797
pages=150,
5898
)
99+
"""Ecosystem configuration for npm packages with pagination."""
59100

60101

61102
ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}
103+
"""Dictionary mapping ecosystem names to their configurations."""
104+
105+
106+
def get_params(params: dict[str, Any] | None, page: int | None) -> dict[str, Any]:
107+
"""Combine base parameters with page parameter if provided."""
108+
new_params: dict[str, Any] = {}
109+
if params:
110+
new_params |= params
111+
112+
if page:
113+
new_params["page"] = page
114+
115+
return new_params
116+
117+
118+
def _run(ecosystem: str) -> None:
    """Download packages for the specified ecosystem and save to file.

    Fetches every configured page for the ecosystem through a single shared
    HTTP client, merges the parsed package names, and writes them to
    ``<DEPENDENCIES_DIR>/<ecosystem>.json``.

    Args:
        ecosystem: Key into ``ECOSYSTEMS`` selecting what to download.

    Raises:
        KeyError: If ``ecosystem`` is not a key of ``ECOSYSTEMS``.
    """
    selected_ecosystem = ECOSYSTEMS[ecosystem]
    all_packages: set[str] = set()

    # Ecosystems without a page count are fetched exactly once and without a
    # ``page`` query parameter.
    paginated = bool(selected_ecosystem.pages)
    n_pages = selected_ecosystem.pages or 1
    with httpx.Client(timeout=TIMEOUT) as client:
        for page in range(1, n_pages + 1):
            params = get_params(selected_ecosystem.params, page if paginated else None)
            all_packages.update(get_packages(client, selected_ecosystem.url, selected_ecosystem.parser, params))

    fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
    # A set has no stable iteration order (string hashing varies between
    # interpreter runs), so sort before saving to keep the generated file
    # reproducible and its diffs limited to real package changes.
    save_data_to_file(sorted(all_packages), fpath)
62131

63132

64133
@click.group()
65134
def entry_point() -> None:
135+
"""Entry point for the CLI application."""
66136
pass
67137

68138

@@ -75,45 +145,44 @@ def entry_point() -> None:
75145
def download(
76146
ecosystem: str,
77147
) -> None:
148+
"""Download packages for the specified ecosystem."""
78149
if ecosystem not in ECOSYSTEMS:
79150
raise click.BadParameter("Not a valid ecosystem")
80151

81-
selected_ecosystem = ECOSYSTEMS[ecosystem]
82-
all_packages: set[str] = set()
83-
84-
n_pages = selected_ecosystem.pages or 1
85-
params = selected_ecosystem.params.copy()
86-
for page in range(1, n_pages + 1):
87-
if selected_ecosystem.pages:
88-
params["page"] = page
89-
90-
all_packages.update(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params))
91-
92-
fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json"
93-
save_data_to_file(list(all_packages), fpath)
152+
return _run(ecosystem)
94153

95154

96-
@stamina.retry(
97-
on=(httpx.TransportError, httpx.TimeoutException, ServerError),
98-
attempts=10,
99-
wait_jitter=1,
100-
wait_exp_base=2,
101-
wait_max=8,
102-
)
103155
def get_packages(
    client: httpx.Client,
    base_url: str,
    parser: Callable[[dict[str, Any]], set[str]],
    params: dict[str, Any] | None = None,
) -> set[str]:
    """Fetch and parse package data from a URL with retry logic.

    The request is retried on the exceptions listed in ``RETRY_ON``, up to
    ``RETRY_ATTEMPTS`` attempts, using the ``RETRY_WAIT_*`` backoff settings.

    Args:
        client: HTTP client used to perform the request.
        base_url: URL to fetch.
        parser: Callable that turns the decoded JSON payload into a set of
            package names.
        params: Optional query parameters for the request.

    Returns:
        The set of package names produced by ``parser``.

    Raises:
        ServerError: If the server keeps answering with a 5xx status after
            all retry attempts.
        httpx.HTTPStatusError: If the server answers with any other error
            status; this is not retried.
        InvalidJSONError: If the response body is not valid JSON (or if
            ``parser`` rejects the payload).
    """
    for attempt in stamina.retry_context(
        on=RETRY_ON,
        attempts=RETRY_ATTEMPTS,
        wait_jitter=RETRY_WAIT_JITTER,
        wait_exp_base=RETRY_WAIT_EXP_BASE,
        wait_max=RETRY_WAIT_MAX,
    ):
        with attempt:
            response = client.get(str(base_url), params=params)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as e:
                if e.response.is_server_error:
                    # 5xx responses are translated so the retry policy
                    # (RETRY_ON) picks them up.
                    raise ServerError from e
                # Any other error status is not retryable; surface it rather
                # than silently parsing an error body as package data.
                raise
            try:
                json_data = response.json()
            except json.JSONDecodeError as e:
                raise InvalidJSONError from e

    # Parsing happens after the retry loop: a payload of the wrong shape is
    # not a transient failure.
    return parser(json_data)
114182

115183

116184
def save_data_to_file(all_packages: list[str], fpath: Path) -> None:
    """Save package data to a JSON file with timestamp.

    The file contains a JSON object with two keys: ``"date"``, the current
    UTC time in ISO 8601 format, and ``"packages"``, the given list.

    Args:
        all_packages: Package names to store, written in the given order.
        fpath: Destination file; it is created or overwritten.
    """
    data = {"date": datetime.now(ZoneInfo("UTC")).isoformat(), "packages": all_packages}
    # Pin the encoding so the output does not depend on the platform's
    # locale default; ``open`` accepts the Path object directly.
    with open(fpath, "w", encoding="utf-8") as fp:
        json.dump(data, fp)

dependencies/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)