Skip to content

Commit aa347fc

Browse files
feat: fetch datasets list from registry
1 parent 2277eb8 commit aa347fc

File tree

2 files changed

+49
-10
lines changed

2 files changed

+49
-10
lines changed

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,23 @@
88

99
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
1010

11-
REGISTERED_DATASETS = {
12-
"uganda_gpp": "https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/infrastructure/projects/download?format=json", # noqa: E501
13-
"ghana_cost_sekondi_takoradi": "https://costsekondi-takoradigh.org/uploads/projectJson.json", # noqa: E501
14-
"mexico_cost_jalisco": "http://www.costjalisco.org.mx/jsonprojects",
15-
"mexico_nuevo_leon": "http://si.nl.gob.mx/siasi_ws/api/edcapi/DescargarProjectPackage", # noqa: E501
16-
"indonesia_cost_west_lombok": "https://intras.lombokbaratkab.go.id/oc4ids",
17-
"ukraine_cost_ukraine": "https://portal.costukraine.org/data.json",
18-
"malawi_cost_malawi": "https://ippi.mw/api/projects/query",
19-
}
11+
12+
def fetch_registered_datasets() -> dict[str, str]:
    """Fetch the mapping of registered dataset names to their URLs.

    Downloads the records listing from the OC4IDS registry and extracts each
    record's ``fields.url.value`` entry, keyed by the record's key.

    Returns:
        A dict mapping each record key to its URL value.

    Raises:
        Exception: If the request fails, the response has an error status,
            the body cannot be parsed as JSON, or a record lacks the expected
            structure. The underlying error is passed as the second argument
            and is also set as the explicit cause.
    """
    logger.info("Fetching registered datasets list from registry")
    try:
        url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json"  # noqa: E501
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would hang the whole pipeline run.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        json_data = r.json()
        registered_datasets = {
            key: value["fields"]["url"]["value"]
            for key, value in json_data["records"].items()
        }
        logger.info(f"Fetched URLs for {len(registered_datasets)} datasets")
        return registered_datasets
    except Exception as e:
        # Keep the original error in the args (so it appears in str(exc)) and
        # chain it explicitly so the traceback shows it as the direct cause.
        raise Exception("Failed to fetch datasets list from registry", e) from e
2028

2129

2230
def download_json(url: str) -> Any:
@@ -66,7 +74,8 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
6674

6775

6876
def process_datasets() -> None:
    """Process every dataset currently listed in the registry.

    Fetches the name-to-URL mapping with ``fetch_registered_datasets`` and
    hands each entry to ``process_dataset`` in iteration order.
    """
    for dataset_name, dataset_url in fetch_registered_datasets().items():
        process_dataset(dataset_name, dataset_url)
7180

7281

tests/test_pipeline.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,48 @@
11
import os
22
import tempfile
33
from textwrap import dedent
4+
from unittest.mock import MagicMock
45

56
import pytest
67
from pytest_mock import MockerFixture
78

89
from oc4ids_datastore_pipeline.pipeline import (
910
download_json,
11+
fetch_registered_datasets,
1012
process_dataset,
1113
validate_json,
1214
write_json_to_file,
1315
)
1416

1517

18+
def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
    """A registry payload is flattened into a ``{name: url}`` mapping."""
    registry_payload = {
        "records": {
            "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}}
        }
    }
    fake_response = MagicMock()
    fake_response.json.return_value = registry_payload
    # Replace the HTTP call so the test never touches the network.
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.requests.get",
        return_value=fake_response,
    )

    fetched = fetch_registered_datasets()

    assert fetched == {"test_dataset": "https://test_dataset.json"}
31+
32+
33+
def test_fetch_registered_datasets_raises_failure_exception(
    mocker: MockerFixture,
) -> None:
    """A failing HTTP call surfaces as a wrapped failure exception."""
    # Make the patched ``requests.get`` raise as soon as it is called.
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.requests.get",
        side_effect=Exception("Mocked exception"),
    )

    with pytest.raises(Exception) as exc_info:
        fetch_registered_datasets()

    # Both the wrapper message and the underlying error must be visible.
    message = str(exc_info.value)
    assert "Failed to fetch datasets list from registry" in message
    assert "Mocked exception" in message
44+
45+
1646
def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
1747
patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
1848
patch_get.side_effect = Exception("Mocked exception")

0 commit comments

Comments
 (0)