From 3a192bd855428fe553e8a9c4f09fe771d8e4f90f Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 13:08:29 +0200 Subject: [PATCH 1/4] ci: install local package before tests --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6beb13e..019e378 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,6 +11,8 @@ jobs: python-version: "3.12" - name: Install dev requirements run: pip install -r requirements_dev.txt + - name: Install local package + run: pip install . - name: Check black run: black --check oc4ids_datastore_pipeline/ tests/ - name: Check isort From 81a6b7f4a230fd66c3522b9b199667e0fa1cbbe4 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:34:09 +0200 Subject: [PATCH 2/4] feat: download datasets --- oc4ids_datastore_pipeline/pipeline.py | 40 ++++++++++++++++++++++++++- pyproject.toml | 10 ++++++- requirements_dev.txt | 18 ++++++++++++ tests/test_pipeline.py | 26 +++++++++++++++-- 4 files changed, 90 insertions(+), 4 deletions(-) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 56081a4..1ce6189 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -1,7 +1,45 @@ import logging +from typing import Any + +import requests logger = logging.getLogger(__name__) +REGISTERED_DATASETS = { + "uganda_gpp": "https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/infrastructure/projects/download?format=json", # noqa: E501 + "ghana_cost_sekondi_takoradi": "https://costsekondi-takoradigh.org/uploads/projectJson.json", # noqa: E501 + "mexico_cost_jalisco": "http://www.costjalisco.org.mx/jsonprojects", + "mexico_nuevo_leon": "http://si.nl.gob.mx/siasi_ws/api/edcapi/DescargarProjectPackage", # noqa: E501 + "indonesia_cost_west_lombok": "https://intras.lombokbaratkab.go.id/oc4ids", + "ukraine_cost_ukraine": "https://portal.costukraine.org/data.json", + "malawi_cost_malawi": "https://ippi.mw/api/projects/query", +} + + +def download_json(url: str) -> Any: + logger.info(f"Downloading json from {url}") + try: + r = requests.get(url) + r.raise_for_status() + response_size = len(r.content) + logger.info(f"Downloaded {url} ({response_size} bytes)") + return r.json() + except Exception as e: + raise Exception("Download failed", e) + + +def process_dataset(dataset_name: str, dataset_url: str) -> None: + logger.info(f"Processing dataset {dataset_name}") + try: + download_json(dataset_url) + except Exception as e: + logger.warning(f"Failed to process dataset {dataset_name} with error {e}") + + +def process_datasets() -> None: + for name, url in REGISTERED_DATASETS.items(): + process_dataset(name, url) + def run() -> None: - logger.info("Hello World!") + process_datasets() diff --git a/pyproject.toml b/pyproject.toml index 603c4ba..96ebb3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,9 @@ name = "oc4ids-datastore-pipeline" description = "OC4IDS Datastore Pipeline" version = "0.1.0" readme = "README.md" -dependencies = [] +dependencies = [ + "requests" +] [project.optional-dependencies] dev = [ @@ -17,6 +19,8 @@ dev = [ "Flake8-pyproject", "mypy", "pytest", + "pytest-mock", + "types-requests", ] [project.scripts] @@ -30,3 +34,7 @@ max-line-length = 88 [tool.mypy] strict = true + +[tool.pytest.ini_options] +log_cli = true +log_cli_level = "INFO" diff --git a/requirements_dev.txt b/requirements_dev.txt index b1ce205..1102839 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -6,6 +6,10 @@ # black==25.1.0 # via oc4ids-datastore-pipeline (pyproject.toml) +certifi==2025.1.31 + # via requests +charset-normalizer==3.4.1 + # via requests click==8.1.8 # via black flake8==7.1.1 @@ -14,6 +18,8 @@ flake8==7.1.1 # oc4ids-datastore-pipeline (pyproject.toml) flake8-pyproject==1.2.3 # via oc4ids-datastore-pipeline (pyproject.toml) +idna==3.10 + # via requests iniconfig==2.0.0 # via pytest isort==6.0.0 @@ -41,6 +47,18 @@ pycodestyle==2.12.1 pyflakes==3.2.0 # via flake8 pytest==8.3.4 + # via + # oc4ids-datastore-pipeline (pyproject.toml) + # pytest-mock +pytest-mock==3.14.0 + # via oc4ids-datastore-pipeline (pyproject.toml) +requests==2.32.3 + # via oc4ids-datastore-pipeline (pyproject.toml) +types-requests==2.32.0.20241016 # via oc4ids-datastore-pipeline (pyproject.toml) typing-extensions==4.12.2 # via mypy +urllib3==2.3.0 + # via + # requests + # types-requests diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index af8ff9f..955f6cc 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,2 +1,24 @@ -def test_hello_world() -> None: - pass +import pytest +from pytest_mock import MockerFixture + +from oc4ids_datastore_pipeline.pipeline import download_json, process_dataset + + +def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.side_effect = Exception("Mocked exception") + + with pytest.raises(Exception) as exc_info: + download_json(url="https://test_dataset.json") + + assert "Download failed" in str(exc_info.value) + assert "Mocked exception" in str(exc_info.value) + + +def test_process_dataset_catches_exception(mocker: MockerFixture) -> None: + patch_download_json = mocker.patch( + "oc4ids_datastore_pipeline.pipeline.download_json" + ) + patch_download_json.side_effect = Exception("Download failed") + + process_dataset("test_dataset", "https://test_dataset.json") From 74bf6f8a72832a049a79f6746d0d464817f916da Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:01:12 +0200 Subject: [PATCH 3/4] feat: validate datasets --- oc4ids_datastore_pipeline/pipeline.py | 17 +++- pyproject.toml | 7 +- requirements_dev.txt | 136 +++++++++++++++++++++++++- tests/test_pipeline.py | 34 ++++++- 4 files changed, 187 insertions(+), 7 deletions(-) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 1ce6189..4d87911 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -2,6 +2,7 @@ from typing import Any import requests +from libcoveoc4ids.api import oc4ids_json_output logger = logging.getLogger(__name__) @@ -28,10 +29,24 @@ def download_json(url: str) -> Any: raise Exception("Download failed", e) +def validate_json(dataset_name: str, json_data: Any) -> None: + logger.info(f"Validating dataset {dataset_name}") + try: + validation_result = oc4ids_json_output(json_data=json_data) + validation_errors_count = validation_result["validation_errors_count"] + if validation_errors_count > 0: + raise Exception(f"Dataset has {validation_errors_count} validation errors") + logger.info(f"Dataset {dataset_name} is valid") + except Exception as e: + raise Exception("Validation failed", e) + + def process_dataset(dataset_name: str, dataset_url: str) -> None: logger.info(f"Processing dataset {dataset_name}") try: - download_json(dataset_url) + json_data = download_json(dataset_url) + validate_json(dataset_name, json_data) + logger.info(f"Processed dataset {dataset_name}") except Exception as e: logger.warning(f"Failed to process dataset {dataset_name} with error {e}") diff --git a/pyproject.toml b/pyproject.toml index 96ebb3d..a12b620 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ description = "OC4IDS Datastore Pipeline" version = "0.1.0" readme = "README.md" dependencies = [ - "requests" + "libcoveoc4ids", + "requests", ] [project.optional-dependencies] @@ -35,6 +36,10 @@ max-line-length = 88 [tool.mypy] strict = true +[[tool.mypy.overrides]] +module = ["libcoveoc4ids.*"] +follow_untyped_imports = true + [tool.pytest.ini_options] log_cli = true log_cli_level = "INFO" diff --git a/requirements_dev.txt b/requirements_dev.txt index 1102839..c9d8620 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -4,26 +4,75 @@ # # pip-compile --extra=dev --output-file=requirements_dev.txt pyproject.toml # +attrs==25.1.0 + # via + # cattrs + # jsonschema + # referencing + # requests-cache +backports-datetime-fromisoformat==2.0.3 + # via flattentool black==25.1.0 # via oc4ids-datastore-pipeline (pyproject.toml) +btrees==6.1 + # via zodb +cattrs==24.1.2 + # via requests-cache certifi==2025.1.31 # via requests +cffi==1.17.1 + # via persistent charset-normalizer==3.4.1 # via requests click==8.1.8 - # via black + # via + # black + # libcoveoc4ids + # libcoveocds +defusedxml==0.7.1 + # via odfpy +et-xmlfile==2.0.0 + # via openpyxl flake8==7.1.1 # via # flake8-pyproject # oc4ids-datastore-pipeline (pyproject.toml) flake8-pyproject==1.2.3 # via oc4ids-datastore-pipeline (pyproject.toml) +flattentool==0.27.0 + # via libcove idna==3.10 # via requests +ijson==3.3.0 + # via flattentool iniconfig==2.0.0 # via pytest isort==6.0.0 # via oc4ids-datastore-pipeline (pyproject.toml) +json-merge-patch==0.2 + # via ocdsextensionregistry +jsonref==1.1.0 + # via + # flattentool + # libcove + # libcoveocds + # ocdsextensionregistry +jsonschema==4.23.0 + # via + # libcove + # libcoveocds +jsonschema-specifications==2024.10.1 + # via jsonschema +libcove==0.32.1 + # via + # libcoveoc4ids + # libcoveocds +libcoveoc4ids==0.9.0 + # via oc4ids-datastore-pipeline (pyproject.toml) +libcoveocds==0.16.4 + # via libcoveoc4ids +lxml==5.3.0 + # via flattentool mccabe==0.7.0 # via flake8 mypy==1.14.1 @@ -32,18 +81,32 @@ mypy-extensions==1.0.0 # via # black # mypy +ocdsextensionregistry==0.6.9 + # via libcoveocds +odfpy==1.4.1 + # via flattentool +openpyxl==3.1.5 + # via flattentool packaging==24.2 # via # black # pytest pathspec==0.12.1 # via black +persistent==6.1 + # via + # btrees + # zodb platformdirs==4.3.6 - # via black + # via + # black + # requests-cache pluggy==1.5.0 # via pytest pycodestyle==2.12.1 # via flake8 +pycparser==2.22 + # via cffi pyflakes==3.2.0 # via flake8 pytest==8.3.4 @@ -52,13 +115,78 @@ pytest==8.3.4 # pytest-mock pytest-mock==3.14.0 # via oc4ids-datastore-pipeline (pyproject.toml) +pytz==2025.1 + # via flattentool +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # libcove + # libcoveocds requests==2.32.3 - # via oc4ids-datastore-pipeline (pyproject.toml) + # via + # libcove + # libcoveocds + # oc4ids-datastore-pipeline (pyproject.toml) + # ocdsextensionregistry + # requests-cache +requests-cache==1.2.1 + # via ocdsextensionregistry +rfc3339-validator==0.1.4 + # via libcove +rfc3987==1.3.8 + # via libcove +rpds-py==0.22.3 + # via + # jsonschema + # referencing +schema==0.7.7 + # via flattentool +six==1.17.0 + # via + # rfc3339-validator + # url-normalize +transaction==5.0 + # via zodb types-requests==2.32.0.20241016 # via oc4ids-datastore-pipeline (pyproject.toml) typing-extensions==4.12.2 - # via mypy + # via + # mypy + # referencing +url-normalize==1.4.3 + # via requests-cache urllib3==2.3.0 # via # requests + # requests-cache # types-requests +xmltodict==0.14.2 + # via flattentool +zc-lockfile==3.0.post1 + # via zodb +zc-zlibstorage==1.2.0 + # via flattentool +zconfig==4.2 + # via zodb +zodb==6.0 + # via + # flattentool + # zc-zlibstorage +zodbpickle==4.1.1 + # via zodb +zope-deferredimport==5.0 + # via persistent +zope-interface==7.2 + # via + # btrees + # persistent + # transaction + # zc-zlibstorage + # zodb + # zope-proxy +zope-proxy==6.1 + # via zope-deferredimport + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 955f6cc..20d8575 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,7 +1,11 @@ import pytest from pytest_mock import MockerFixture -from oc4ids_datastore_pipeline.pipeline import download_json, process_dataset +from oc4ids_datastore_pipeline.pipeline import ( + download_json, + process_dataset, + validate_json, +) def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: @@ -15,6 +19,34 @@ def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: assert "Mocked exception" in str(exc_info.value) +def test_validate_json_raises_failure_exception(mocker: MockerFixture) -> None: + patch_oc4ids_json_output = mocker.patch( + "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output" + ) + patch_oc4ids_json_output.side_effect = Exception("Mocked exception") + + with pytest.raises(Exception) as exc_info: + validate_json(dataset_name="test_dataset", json_data={}) + + assert "Validation failed" in str(exc_info.value) + assert "Mocked exception" in str(exc_info.value) + + +def test_validate_json_raises_validation_errors_exception( + mocker: MockerFixture, +) -> None: + patch_oc4ids_json_output = mocker.patch( + "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output" + ) + patch_oc4ids_json_output.return_value = {"validation_errors_count": 2} + + with pytest.raises(Exception) as exc_info: + validate_json(dataset_name="test_dataset", json_data={}) + + assert "Validation failed" in str(exc_info.value) + assert "Dataset has 2 validation errors" in str(exc_info.value) + + def test_process_dataset_catches_exception(mocker: MockerFixture) -> None: patch_download_json = mocker.patch( "oc4ids_datastore_pipeline.pipeline.download_json" From 2277eb88a78c95c22f4e4bcf6247bb4478cb35d1 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:59:52 +0200 Subject: [PATCH 4/4] feat: write json to file --- .gitignore | 2 ++ oc4ids_datastore_pipeline/pipeline.py | 14 ++++++++++++ tests/test_pipeline.py | 33 +++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/.gitignore b/.gitignore index 033df5f..126d2cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .venv __pycache__ + +data/ diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 4d87911..6b4832f 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -1,4 +1,6 @@ +import json import logging +import os from typing import Any import requests @@ -41,11 +43,23 @@ def validate_json(dataset_name: str, json_data: Any) -> None: raise Exception("Validation failed", e) +def write_json_to_file(file_name: str, json_data: Any) -> None: + logger.info(f"Writing dataset to file {file_name}") + try: + os.makedirs(os.path.dirname(file_name), exist_ok=True) + with open(file_name, "w") as file: + json.dump(json_data, file, indent=4) + logger.info(f"Finished writing to {file_name}") + except Exception as e: + raise Exception("Error while writing to JSON file", e) + + def process_dataset(dataset_name: str, dataset_url: str) -> None: logger.info(f"Processing dataset {dataset_name}") try: json_data = download_json(dataset_url) validate_json(dataset_name, json_data) + write_json_to_file(f"data/{dataset_name}.json", json_data) logger.info(f"Processed dataset {dataset_name}") except Exception as e: logger.warning(f"Failed to process dataset {dataset_name} with error {e}") diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 20d8575..281c5f4 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,3 +1,7 @@ +import os +import tempfile +from textwrap import dedent + import pytest from pytest_mock import MockerFixture @@ -5,6 +9,7 @@ download_json, process_dataset, validate_json, + write_json_to_file, ) @@ -47,6 +52,34 @@ def test_validate_json_raises_validation_errors_exception( assert "Dataset has 2 validation errors" in str(exc_info.value) +def test_write_json_to_file_writes_in_correct_format() -> None: + with tempfile.TemporaryDirectory() as dir: + file_name = os.path.join(dir, "test_dataset.json") + write_json_to_file(file_name=file_name, json_data={"key": "value"}) + + expected = dedent( + """\ + { + "key": "value" + }""" + ) + with open(file_name) as file: + assert file.read() == expected + + +def test_write_json_to_file_raises_failure_exception(mocker: MockerFixture) -> None: + patch_json_dump = mocker.patch("oc4ids_datastore_pipeline.pipeline.json.dump") + patch_json_dump.side_effect = Exception("Mocked exception") + + with pytest.raises(Exception) as exc_info: + with tempfile.TemporaryDirectory() as dir: + file_name = os.path.join(dir, "test_dataset.json") + write_json_to_file(file_name=file_name, json_data={"key": "value"}) + + assert "Error while writing to JSON file" in str(exc_info.value) + assert "Mocked exception" in str(exc_info.value) + + def test_process_dataset_catches_exception(mocker: MockerFixture) -> None: patch_download_json = mocker.patch( "oc4ids_datastore_pipeline.pipeline.download_json"