29 changes: 19 additions & 10 deletions oc4ids_datastore_pipeline/pipeline.py
@@ -8,15 +8,23 @@

logger = logging.getLogger(__name__)

REGISTERED_DATASETS = {
    "uganda_gpp": "https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/infrastructure/projects/download?format=json",  # noqa: E501
    "ghana_cost_sekondi_takoradi": "https://costsekondi-takoradigh.org/uploads/projectJson.json",  # noqa: E501
    "mexico_cost_jalisco": "http://www.costjalisco.org.mx/jsonprojects",
    "mexico_nuevo_leon": "http://si.nl.gob.mx/siasi_ws/api/edcapi/DescargarProjectPackage",  # noqa: E501
    "indonesia_cost_west_lombok": "https://intras.lombokbaratkab.go.id/oc4ids",
    "ukraine_cost_ukraine": "https://portal.costukraine.org/data.json",
    "malawi_cost_malawi": "https://ippi.mw/api/projects/query",
}

def fetch_registered_datasets() -> dict[str, str]:
    logger.info("Fetching registered datasets list from registry")
    try:
        url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json"  # noqa: E501
        r = requests.get(url)
        r.raise_for_status()
        json_data = r.json()
        registered_datasets = {
            key: value["fields"]["url"]["value"]
            for (key, value) in json_data["records"].items()
        }
        registered_datasets_count = len(registered_datasets)
        logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
        return registered_datasets
    except Exception as e:
        raise Exception("Failed to fetch datasets list from registry", e)


def download_json(url: str) -> Any:
@@ -66,7 +74,8 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:


def process_datasets() -> None:
    for name, url in REGISTERED_DATASETS.items():
    registered_datasets = fetch_registered_datasets()
    for name, url in registered_datasets.items():
        process_dataset(name, url)


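For reference, a minimal sketch of the registry payload shape that the new fetch_registered_datasets expects, mirroring the mock used in the tests below; the dataset key and URL here are illustrative placeholders, not real registry entries.

# Hypothetical example of the records_api.json payload consumed by
# fetch_registered_datasets; the key and URL are placeholders.
records_api_payload = {
    "records": {
        "example_dataset": {
            "fields": {"url": {"value": "https://example.org/projects.json"}}
        }
    }
}

# The dict comprehension in fetch_registered_datasets flattens this to:
# {"example_dataset": "https://example.org/projects.json"}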
30 changes: 30 additions & 0 deletions tests/test_pipeline.py
@@ -1,18 +1,48 @@
import os
import tempfile
from textwrap import dedent
from unittest.mock import MagicMock

import pytest
from pytest_mock import MockerFixture

from oc4ids_datastore_pipeline.pipeline import (
    download_json,
    fetch_registered_datasets,
    process_dataset,
    validate_json,
    write_json_to_file,
)


def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
    mock_response = MagicMock()
    mock_response.json.return_value = {
        "records": {
            "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}}
        }
    }
    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
    patch_get.return_value = mock_response

    result = fetch_registered_datasets()

    assert result == {"test_dataset": "https://test_dataset.json"}


def test_fetch_registered_datasets_raises_failure_exception(
    mocker: MockerFixture,
) -> None:
    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
    patch_get.side_effect = Exception("Mocked exception")

    with pytest.raises(Exception) as exc_info:
        fetch_registered_datasets()

    assert "Failed to fetch datasets list from registry" in str(exc_info.value)
    assert "Mocked exception" in str(exc_info.value)


def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
    patch_get.side_effect = Exception("Mocked exception")
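Not part of this PR, but a possible follow-up test sketch for the new wiring in process_datasets, written in the same pytest-mock style as the tests above; the test name and assertions are illustrative, and it would also need process_datasets added to the imports from oc4ids_datastore_pipeline.pipeline.

def test_process_datasets_uses_fetched_registry(mocker: MockerFixture) -> None:
    # Hypothetical sketch: stub the registry fetch and the per-dataset
    # processing, then check that each fetched dataset is processed.
    patch_fetch = mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.fetch_registered_datasets"
    )
    patch_fetch.return_value = {"test_dataset": "https://test_dataset.json"}
    patch_process = mocker.patch("oc4ids_datastore_pipeline.pipeline.process_dataset")

    process_datasets()

    patch_process.assert_called_once_with("test_dataset", "https://test_dataset.json")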