diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ade1abe..e3ffc6b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,3 +27,18 @@ jobs: - name: Run tests run: uv run pytest tests + + test-weekly-download: + runs-on: ubuntu-latest + if: "!startsWith(github.event.head_commit.message, 'bump:')" + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - name: Install uv + uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 + + - name: Install the dependencies + run: uv sync --locked --group download --group dev --python 3.14 + + - name: Run tests + run: uv run pytest dependencies/tests diff --git a/.github/workflows/weekly_download.yml b/.github/workflows/weekly_download.yml index 5d4fa99..954ac7c 100644 --- a/.github/workflows/weekly_download.yml +++ b/.github/workflows/weekly_download.yml @@ -2,42 +2,46 @@ name: Weekly download on: schedule: - - cron: "0 0 * * 1" # every Monday at 00:00 UTC + - cron: "0 0 * * 1" # every Monday at 00:00 UTC workflow_dispatch: - jobs: download: runs-on: ubuntu-latest steps: - - uses: actions/create-github-app-token@67018539274d69449ef7c02e8e71183d1719ab42 # v2.1.4 - id: app-token - with: - app-id: ${{ vars.ELEMENTSINTERACTIVE_BOT_APP_ID }} - private-key: ${{ secrets.ELEMENTSINTERACTIVE_BOT_PRIVATE_KEY }} - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - fetch-depth: 0 - token: ${{ steps.app-token.outputs.token }} - ref: ${{ github.head_ref }} - - name: Install uv - uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 + - uses: actions/create-github-app-token@67018539274d69449ef7c02e8e71183d1719ab42 # v2.1.4 + id: app-token + with: + app-id: ${{ vars.ELEMENTSINTERACTIVE_BOT_APP_ID }} + private-key: ${{ secrets.ELEMENTSINTERACTIVE_BOT_PRIVATE_KEY }} + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + token: ${{ steps.app-token.outputs.token }} + ref: ${{ github.head_ref }} + - name: Install uv + uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 + + - name: Install the project + run: uv sync --locked --only-group download --python 3.14 + + - name: Download Pypi packages + continue-on-error: true + run: | + uv run --no-project dependencies/scripts/download_packages.py download pypi + + - name: Download NPM packages + continue-on-error: true + run: | + uv run --no-project dependencies/scripts/download_packages.py download npm - - name: Install the project - run: uv sync --locked --only-group download + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" - - name: Download packages from trusted sources - run: | - uv run --no-project dependencies/scripts/download_packages.py download pypi || echo 'Failed to download trusted pypi packages' - uv run --no-project dependencies/scripts/download_packages.py download npm || echo 'Failed to download trusted npm packages' - - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - - name: Push changes to repo - run: | - git add . - git commit -m "chore: Weekly update of trusted packages" - git push origin HEAD:main \ No newline at end of file + - name: Push changes to repo + run: | + git add . + git commit -m "chore: Weekly update of trusted packages" + git push origin HEAD:main diff --git a/dependencies/scripts/__init__.py b/dependencies/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dependencies/scripts/download_packages.py b/dependencies/scripts/download_packages.py index 486eff5..5bce9be 100644 --- a/dependencies/scripts/download_packages.py +++ b/dependencies/scripts/download_packages.py @@ -19,26 +19,65 @@ ) +class ServerError(Exception): + """Custom exception for HTTP 5xx errors.""" + + +class InvalidJSONError(Exception): + """Custom exception for when the received JSON does not match the expected format.""" + + +# Directory name DEPENDENCIES_DIR = "dependencies" +"""Directory name where dependency files will be saved.""" + +# Sources TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json" +"""URL for fetching top PyPI packages data.""" + TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages" +"""URL for fetching top npm packages data from ecosyste.ms.""" + +# Retry constants +RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError) +"""Tuple of exceptions that should trigger retry attempts.""" + +RETRY_ATTEMPTS = 15 +"""Maximum number of retry attempts for failed requests.""" + +RETRY_WAIT_JITTER = 1 +"""Random jitter factor for retry wait times.""" + +RETRY_WAIT_EXP_BASE = 2 +"""Exponential backoff base multiplier for retry wait times.""" + +RETRY_WAIT_MAX = 8 +"""Maximum wait time between retry attempts in seconds.""" + TIMEOUT = 90 +"""HTTP request timeout in seconds.""" def parse_npm(data: list[dict[str, Any]]) -> set[str]: - return {x["name"] for x in data} + """Parse npm package data and extract package names.""" + try: + return {x["name"] for x in data} + except KeyError as e: + raise InvalidJSONError from e def parse_pypi(data: dict[str, Any]) -> set[str]: - return {row["project"] for row in data["rows"]} - - -class ServerError(Exception): - """Custom exception for HTTP 5xx errors.""" + """Parse PyPI package data and extract package names.""" + try: + return {row["project"] for row in data["rows"]} + except KeyError as e: + raise InvalidJSONError from e @dataclass(frozen=True) class Ecosystem: + """Configuration for a package ecosystem (PyPI, npm, etc.).""" + url: str parser: Callable[[Any], set[str]] params: dict[str, Any] = field(default_factory=dict) @@ -49,6 +88,7 @@ class Ecosystem: url=TOP_PYPI_SOURCE, parser=parse_pypi, ) +"""Ecosystem configuration for PyPI packages.""" npm_ecosystem = Ecosystem( url=TOP_NPM_SOURCE, @@ -56,13 +96,43 @@ class Ecosystem: params={"per_page": 100, "sort": "downloads"}, pages=150, ) +"""Ecosystem configuration for npm packages with pagination.""" ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem} +"""Dictionary mapping ecosystem names to their configurations.""" + + +def get_params(params: dict[str, Any] | None, page: int | None) -> dict[str, Any]: + """Combine base parameters with page parameter if provided.""" + new_params: dict[str, Any] = {} + if params: + new_params |= params + + if page: + new_params["page"] = page + + return new_params + + +def _run(ecosystem: str) -> None: + """Download packages for the specified ecosystem and save to file.""" + selected_ecosystem = ECOSYSTEMS[ecosystem] + all_packages: set[str] = set() + + n_pages = selected_ecosystem.pages or 1 + with httpx.Client(timeout=TIMEOUT) as client: + for page in range(1, n_pages + 1): + params = get_params(selected_ecosystem.params, page if selected_ecosystem.pages else None) + all_packages.update(get_packages(client, selected_ecosystem.url, selected_ecosystem.parser, params)) + + fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json" + save_data_to_file(list(all_packages), fpath) @click.group() def entry_point() -> None: + """Entry point for the CLI application.""" pass @@ -75,45 +145,44 @@ def entry_point() -> None: def download( ecosystem: str, ) -> None: + """Download packages for the specified ecosystem.""" if ecosystem not in ECOSYSTEMS: raise click.BadParameter("Not a valid ecosystem") - selected_ecosystem = ECOSYSTEMS[ecosystem] - all_packages: set[str] = set() - - n_pages = selected_ecosystem.pages or 1 - params = selected_ecosystem.params.copy() - for page in range(1, n_pages + 1): - if selected_ecosystem.pages: - params["page"] = page - - all_packages.update(get_packages(selected_ecosystem.url, selected_ecosystem.parser, params)) - - fpath = Path(DEPENDENCIES_DIR) / f"{ecosystem}.json" - save_data_to_file(list(all_packages), fpath) + return _run(ecosystem) -@stamina.retry( - on=(httpx.TransportError, httpx.TimeoutException, ServerError), - attempts=10, - wait_jitter=1, - wait_exp_base=2, - wait_max=8, -) def get_packages( - base_url: str, parser: Callable[[dict[str, Any]], set[str]], params: dict[str, Any] | None = None + client: httpx.Client, + base_url: str, + parser: Callable[[dict[str, Any]], set[str]], + params: dict[str, Any] | None = None, ) -> set[str]: - with httpx.Client(timeout=TIMEOUT) as client: - response = client.get(str(base_url), params=params) - try: - response.raise_for_status() - except httpx.HTTPStatusError as e: - if e.response.is_server_error: - raise ServerError from e - return parser(response.json()) + """Fetch and parse package data from a URL with retry logic.""" + for attempt in stamina.retry_context( + on=RETRY_ON, + attempts=RETRY_ATTEMPTS, + wait_jitter=RETRY_WAIT_JITTER, + wait_exp_base=RETRY_WAIT_EXP_BASE, + wait_max=RETRY_WAIT_MAX, + ): + with attempt: + response = client.get(str(base_url), params=params) + try: + response.raise_for_status() + except httpx.HTTPStatusError as e: + if e.response.is_server_error: + raise ServerError from e + try: + json_data = response.json() + except json.JSONDecodeError as e: + raise InvalidJSONError from e + + return parser(json_data) def save_data_to_file(all_packages: list[str], fpath: Path) -> None: + """Save package data to a JSON file with timestamp.""" data = {"date": datetime.now(ZoneInfo("UTC")).isoformat(), "packages": all_packages} with open(str(fpath), "w") as fp: json.dump(data, fp) diff --git a/dependencies/tests/__init__.py b/dependencies/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dependencies/tests/test_download_packages.py b/dependencies/tests/test_download_packages.py new file mode 100644 index 0000000..16706d9 --- /dev/null +++ b/dependencies/tests/test_download_packages.py @@ -0,0 +1,251 @@ +import json +from collections.abc import Iterator +from contextlib import contextmanager +from pathlib import Path +from typing import Any +from unittest.mock import Mock, call, patch + +import httpx +import pytest +from click.testing import CliRunner +from freezegun import freeze_time +from scripts.download_packages import ( # noqa: E402 + DEPENDENCIES_DIR, + ECOSYSTEMS, + RETRY_ATTEMPTS, + Ecosystem, + InvalidJSONError, + ServerError, + _run, + download, + npm_ecosystem, + parse_npm, + parse_pypi, +) + + +@contextmanager +def patch_client(json_data: Any) -> Iterator[Mock]: + """Context manager that patches httpx.Client.get to return mock data with specified JSON response.""" + with patch("httpx.Client.get") as mock_client: + mock_data = Mock() + mock_data.json.return_value = json_data + mock_client.return_value = mock_data + yield mock_client + + +@contextmanager +def patch_client_error(error: Exception) -> Iterator[Mock]: + """Context manager that patches httpx.Client.get to raise specified error on status check.""" + with patch("httpx.Client.get") as mock_client: + mock_data = Mock() + mock_data.raise_for_status.side_effect = error + mock_client.return_value = mock_data + yield mock_client + + +@contextmanager +def patch_save_to_file() -> Iterator[Mock]: + """Context manager that patches json.dump to capture file saving operations.""" + with patch("json.dump") as m_json: + yield m_json + + +@contextmanager +def patch_open_file() -> Iterator[Mock]: + """Context manager that patches builtins.open to capture file opening operations.""" + with patch("builtins.open") as mock_open: + yield mock_open + + +@contextmanager +def patch_npm_ecosystem(data: dict[str, Any]) -> Iterator[None]: + """Context manager that temporarily modifies the npm ecosystem configuration for testing.""" + with ( + patch.dict( + ECOSYSTEMS, + {"npm": Ecosystem(**npm_ecosystem.__dict__ | data)}, + ), + ): + yield + + +@freeze_time("2025-01-01") +class TestDownload: + def test_pypi_download(self) -> None: + """Test downloading PyPI packages and verifying the correct API call and data saving.""" + data = { + "rows": [ + {"project": "requests", "download_count": 12345}, + {"project": "setuptools", "download_count": 8765}, + ] + } + with patch_client(data) as m_client, patch_save_to_file() as m_save, patch_open_file() as m_open: + _run("pypi") + + # Check the HTTP request with its parameters + assert m_client.call_count == 1 + assert m_client.call_args == call( + "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json", params={} + ) + + # Check the file path + assert m_open.call_args_list[0] == call(str(Path(DEPENDENCIES_DIR) / "pypi.json"), "w") + + # Check the content of the file + assert m_save.call_count == 1 + assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00" + assert set(m_save.call_args[0][0]["packages"]) == {"setuptools", "requests"} + assert m_save.call_args[0][1] == m_open().__enter__() + + def test_npm_download(self) -> None: + """Test downloading npm packages with pagination and verifying the correct API call and data saving.""" + data = [ + {"name": "lodash", "downloads": 12345}, + {"name": "@aws/sdk", "downloads": 98765}, + ] + with ( + patch_client(data) as m_client, + patch_save_to_file() as m_save, + patch_open_file() as m_open, + patch_npm_ecosystem({"pages": 1}), + ): + _run("npm") + + # Check the HTTP request with its parameters + assert m_client.call_count == 1 + assert m_client.call_args == call( + "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages", + params={"per_page": 100, "sort": "downloads", "page": 1}, + ) + + # Check the file path + assert m_open.call_args_list[0] == call(str(Path(DEPENDENCIES_DIR) / "npm.json"), "w") + + # Check the content of the file + assert m_save.call_count == 1 + assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00" + assert set(m_save.call_args[0][0]["packages"]) == {"@aws/sdk", "lodash"} + assert m_save.call_args[0][1] == m_open().__enter__() + + def test_invalid_ecosystem(self) -> None: + """Test that a KeyError is raised when trying to run with an invalid ecosystem.""" + with pytest.raises(KeyError): + _run("asdf") + + def test_invalid_pypi_json_format(self) -> None: + """Test that InvalidJSONError is raised when PyPI JSON data has invalid format.""" + with pytest.raises(InvalidJSONError): + parse_pypi({}) + + def test_invalid_npm_json_format(self) -> None: + """Test that InvalidJSONError is raised when npm JSON data has invalid format.""" + with pytest.raises(InvalidJSONError): + parse_npm([{"key": "val"}]) + + def test_invalid_downloaded_json(self) -> None: + """Test that InvalidJSONError is raised when downloaded JSON cannot be parsed.""" + with patch("httpx.Client.get") as mock_client: + mock_response = Mock() + mock_response.raise_for_status.return_value = None + mock_response.json.side_effect = json.JSONDecodeError("Invalid JSON", "", 0) + mock_client.return_value = mock_response + with pytest.raises(InvalidJSONError): + _run("pypi") + + def test_retry_mechanism_with_server_errors(self) -> None: + """Test that it will retry as many times as attempts defined and raise an exception afterwards.""" + mock_response = Mock() + mock_response.is_server_error = True + mock_response.status_code = 500 + + server_error = httpx.HTTPStatusError("Server Error", request=Mock(), response=mock_response) + + with ( + patch_client_error(server_error) as mock_client, + patch("scripts.download_packages.RETRY_WAIT_JITTER", 0), + patch("scripts.download_packages.RETRY_WAIT_EXP_BASE", 1), + patch("scripts.download_packages.RETRY_WAIT_MAX", 0), + ): + with pytest.raises(ServerError): + _run("pypi") + + assert mock_client.call_count == RETRY_ATTEMPTS + + def test_npm_download_with_multiple_pages(self) -> None: + """Test that the script will iterate through pages if provided.""" + page1_data = [ + {"name": "lodash", "downloads": 12345}, + {"name": "@aws/sdk", "downloads": 98765}, + ] + page2_data = [ + {"name": "react", "downloads": 54321}, + {"name": "express", "downloads": 87654}, + ] + + with ( + patch_client(None) as m_client, # We'll configure the side_effect below + patch_save_to_file() as m_save, + patch_open_file() as m_open, + patch_npm_ecosystem({"pages": 2}), + ): + # Configure the mock to return different data for each call + mock_responses = [] + for data in [page1_data, page2_data]: + mock_response = Mock() + mock_response.json.return_value = data + mock_responses.append(mock_response) + + m_client.side_effect = mock_responses + + _run("npm") + + assert m_client.call_count == 2 + + assert m_client.call_args_list == [ + call( + "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages", + params={"per_page": 100, "sort": "downloads", "page": 1}, + ), + call( + "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages", + params={"per_page": 100, "sort": "downloads", "page": 2}, + ), + ] + + # Verify that all packages from all pages were collected + assert m_save.call_count == 1 + assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00" + assert set(m_save.call_args[0][0]["packages"]) == {"lodash", "@aws/sdk", "react", "express"} + assert m_save.call_args[0][1] == m_open().__enter__() + + +class TestCli: + def test_non_existing_ecosystem_error(self) -> None: + """Test that an error is raised when a non-existing ecosystem is introduced.""" + runner = CliRunner() + result = runner.invoke(download, ["invalid_ecosystem"]) + + assert result.exit_code != 0 + assert "Not a valid ecosystem" in result.output + + @freeze_time("2025-01-01") + def test_cli(self) -> None: + """Test the script can be run through the cli and contents are saved to file.""" + runner = CliRunner() + + data = { + "rows": [ + {"project": "requests", "download_count": 12345}, + {"project": "setuptools", "download_count": 8765}, + ] + } + with patch_client(data), patch_save_to_file() as m_save, patch_open_file() as m_open: + result = runner.invoke(download, ["pypi"]) + + assert result.exit_code == 0 + + assert m_save.call_count == 1 + assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00" + assert set(m_save.call_args[0][0]["packages"]) == {"setuptools", "requests"} + assert m_save.call_args[0][1] == m_open().__enter__() diff --git a/justfile b/justfile index f7895be..6e4dd02 100644 --- a/justfile +++ b/justfile @@ -32,8 +32,11 @@ clean: # Runs the tests with the specified arguments (any path or pytest argument). test *test-args='': venv - {{ run }} pytest {{ test-args }} --no-cov + {{ run }} pytest tests {{ test-args }} +# Runs the "download" tests with the specified arguments (any path or pytest argument). +test-download *test-args='': venv + {{ run }} pytest dependencies/tests {{ test-args }} # Format all code in the project. format: venv