From aa347fcc8dcc620bf1664fa4d08aab44bc32fb2d Mon Sep 17 00:00:00 2001
From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com>
Date: Tue, 4 Feb 2025 14:05:21 +0200
Subject: [PATCH] feat: fetch datasets list from registry

---
 oc4ids_datastore_pipeline/pipeline.py | 39 ++++++++++++++++++++-------
 tests/test_pipeline.py                | 30 +++++++++++++++++++++
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py
index 6b4832f..64ce207 100644
--- a/oc4ids_datastore_pipeline/pipeline.py
+++ b/oc4ids_datastore_pipeline/pipeline.py
@@ -8,15 +8,33 @@
 
 logger = logging.getLogger(__name__)
 
-REGISTERED_DATASETS = {
-    "uganda_gpp": "https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/infrastructure/projects/download?format=json",  # noqa: E501
-    "ghana_cost_sekondi_takoradi": "https://costsekondi-takoradigh.org/uploads/projectJson.json",  # noqa: E501
-    "mexico_cost_jalisco": "http://www.costjalisco.org.mx/jsonprojects",
-    "mexico_nuevo_leon": "http://si.nl.gob.mx/siasi_ws/api/edcapi/DescargarProjectPackage",  # noqa: E501
-    "indonesia_cost_west_lombok": "https://intras.lombokbaratkab.go.id/oc4ids",
-    "ukraine_cost_ukraine": "https://portal.costukraine.org/data.json",
-    "malawi_cost_malawi": "https://ippi.mw/api/projects/query",
-}
+
+def fetch_registered_datasets() -> dict[str, str]:
+    """Fetch the list of registered datasets from the registry records API.
+
+    Returns:
+        A mapping of each record key to the URL held in its "url" field.
+
+    Raises:
+        Exception: If the request fails or the response cannot be parsed.
+    """
+    logger.info("Fetching registered datasets list from registry")
+    try:
+        url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json"  # noqa: E501
+        # Without a timeout, requests waits indefinitely on a stalled server.
+        r = requests.get(url, timeout=30)
+        r.raise_for_status()
+        json_data = r.json()
+        registered_datasets = {
+            key: value["fields"]["url"]["value"]
+            for (key, value) in json_data["records"].items()
+        }
+        registered_datasets_count = len(registered_datasets)
+        logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
+        return registered_datasets
+    except Exception as e:
+        # Chain explicitly so the root cause is kept as __cause__.
+        raise Exception("Failed to fetch datasets list from registry", e) from e
 
 
 def download_json(url: str) -> Any:
@@ -66,7 +84,8 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
 
 
 def process_datasets() -> None:
-    for name, url in REGISTERED_DATASETS.items():
+    registered_datasets = fetch_registered_datasets()
+    for name, url in registered_datasets.items():
         process_dataset(name, url)
 
 
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 281c5f4..b3b2179 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -1,18 +1,48 @@
 import os
 import tempfile
 from textwrap import dedent
+from unittest.mock import MagicMock
 
 import pytest
 from pytest_mock import MockerFixture
 
 from oc4ids_datastore_pipeline.pipeline import (
     download_json,
+    fetch_registered_datasets,
     process_dataset,
     validate_json,
     write_json_to_file,
 )
 
 
+def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
+    mock_response = MagicMock()
+    mock_response.json.return_value = {
+        "records": {
+            "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}}
+        }
+    }
+    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
+    patch_get.return_value = mock_response
+
+    result = fetch_registered_datasets()
+
+    assert result == {"test_dataset": "https://test_dataset.json"}
+
+
+def test_fetch_registered_datasets_raises_failure_exception(
+    mocker: MockerFixture,
+) -> None:
+    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
+    patch_get.side_effect = Exception("Mocked exception")
+
+    with pytest.raises(Exception) as exc_info:
+        fetch_registered_datasets()
+
+    assert "Failed to fetch datasets list from registry" in str(exc_info.value)
+    assert "Mocked exception" in str(exc_info.value)
+
+
 def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
     patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
     patch_get.side_effect = Exception("Mocked exception")