Skip to content

Commit 38fc1e6

Browse files
Merge pull request #14 from OpenDataServices/2-file-ingestion
Initial pipeline implementation
2 parents 059d9d7 + 2277eb8 commit 38fc1e6

File tree

6 files changed

+324
-7
lines changed

6 files changed

+324
-7
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ jobs:
1111
python-version: "3.12"
1212
- name: Install dev requirements
1313
run: pip install -r requirements_dev.txt
14+
- name: Install local package
15+
run: pip install .
1416
- name: Check black
1517
run: black --check oc4ids_datastore_pipeline/ tests/
1618
- name: Check isort

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
.venv
22
__pycache__
3+
4+
data/
Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,74 @@
1+
import json
12
import logging
3+
import os
4+
from typing import Any
5+
6+
import requests
7+
from libcoveoc4ids.api import oc4ids_json_output
28

39
logger = logging.getLogger(__name__)
410

11+
# Maps each dataset name to the URL its JSON is downloaded from.
# process_datasets() iterates these pairs and passes each to process_dataset(),
# which also uses the name to build the output path data/<name>.json.
REGISTERED_DATASETS = {
12+
"uganda_gpp": "https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/infrastructure/projects/download?format=json", # noqa: E501
13+
"ghana_cost_sekondi_takoradi": "https://costsekondi-takoradigh.org/uploads/projectJson.json", # noqa: E501
14+
"mexico_cost_jalisco": "http://www.costjalisco.org.mx/jsonprojects",
15+
"mexico_nuevo_leon": "http://si.nl.gob.mx/siasi_ws/api/edcapi/DescargarProjectPackage", # noqa: E501
16+
"indonesia_cost_west_lombok": "https://intras.lombokbaratkab.go.id/oc4ids",
17+
"ukraine_cost_ukraine": "https://portal.costukraine.org/data.json",
18+
"malawi_cost_malawi": "https://ippi.mw/api/projects/query",
19+
}
20+
21+
22+
def download_json(url: str, timeout: float = 60.0) -> Any:
    """Fetch ``url`` with an HTTP GET and return the body parsed as JSON.

    Args:
        url: Address to download from.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up. ``requests`` applies no timeout by default, so
            without this a single unresponsive host would stall the whole run.

    Returns:
        The decoded JSON document.

    Raises:
        Exception: With args ``("Download failed", original_error)`` if the
            request fails, the response has an error status, or the body is
            not valid JSON. The original error is also attached as the cause.
    """
    logger.info(f"Downloading json from {url}")
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        response_size = len(r.content)
        logger.info(f"Downloaded {url} ({response_size} bytes)")
        return r.json()
    except Exception as e:
        # Keep the original error in args (callers log str(e)) and chain it
        # explicitly so the traceback reports it as the direct cause.
        raise Exception("Download failed", e) from e
32+
33+
34+
def validate_json(dataset_name: str, json_data: Any) -> None:
    """Validate ``json_data`` with ``oc4ids_json_output``.

    Args:
        dataset_name: Name of the dataset; used only in log messages.
        json_data: The parsed JSON document to validate.

    Raises:
        Exception: With args ``("Validation failed", original_error)`` if the
            validator itself raises, or if it reports a
            ``validation_errors_count`` greater than zero. The original error
            is also attached as the cause.
    """
    logger.info(f"Validating dataset {dataset_name}")
    try:
        validation_result = oc4ids_json_output(json_data=json_data)
        validation_errors_count = validation_result["validation_errors_count"]
        if validation_errors_count > 0:
            # Raised inside the try on purpose: it is wrapped below so every
            # failure leaves this function in the same "Validation failed" shape.
            raise Exception(f"Dataset has {validation_errors_count} validation errors")
        logger.info(f"Dataset {dataset_name} is valid")
    except Exception as e:
        raise Exception("Validation failed", e) from e
44+
45+
46+
def write_json_to_file(file_name: str, json_data: Any) -> None:
    """Serialise ``json_data`` to ``file_name`` as JSON with 4-space indents.

    Any missing parent directories of ``file_name`` are created first.

    Args:
        file_name: Destination path. May be a bare file name with no directory
            component, in which case the file is written relative to the
            current working directory.
        json_data: JSON-serialisable data to write.

    Raises:
        Exception: With args ``("Error while writing to JSON file",
            original_error)`` if the directory cannot be created, the file
            cannot be opened, or the data cannot be serialised. The original
            error is also attached as the cause.
    """
    logger.info(f"Writing dataset to file {file_name}")
    try:
        directory = os.path.dirname(file_name)
        # dirname is "" for a bare file name, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Explicit encoding so output does not depend on the platform locale.
        with open(file_name, "w", encoding="utf-8") as file:
            json.dump(json_data, file, indent=4)
        logger.info(f"Finished writing to {file_name}")
    except Exception as e:
        raise Exception("Error while writing to JSON file", e) from e
55+
56+
57+
def process_dataset(dataset_name: str, dataset_url: str) -> None:
    """Download, validate and store one dataset, without raising on failure.

    The dataset is fetched from ``dataset_url``, validated, and written to
    ``data/<dataset_name>.json``. Any exception from those steps is logged as
    a warning and swallowed, so a failing dataset does not stop the caller.

    Args:
        dataset_name: Name used in log messages and in the output file name.
        dataset_url: URL the dataset JSON is downloaded from.
    """
    logger.info(f"Processing dataset {dataset_name}")
    try:
        dataset = download_json(dataset_url)
        validate_json(dataset_name, dataset)
        output_path = f"data/{dataset_name}.json"
        write_json_to_file(output_path, dataset)
        logger.info(f"Processed dataset {dataset_name}")
    except Exception as exc:
        # Best-effort boundary: report the failure and carry on.
        logger.warning(f"Failed to process dataset {dataset_name} with error {exc}")
66+
67+
68+
def process_datasets() -> None:
    """Run ``process_dataset`` for every entry in ``REGISTERED_DATASETS``."""
    for dataset_name in REGISTERED_DATASETS:
        process_dataset(dataset_name, REGISTERED_DATASETS[dataset_name])
71+
572

673
# Runs the pipeline by calling process_datasets().
# In this diff the placeholder "Hello World!" log line (marked "7-") is removed
# and the process_datasets() call (marked "74+") is added in its place.
def run() -> None:
7-
logger.info("Hello World!")
74+
process_datasets()

pyproject.toml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@ name = "oc4ids-datastore-pipeline"
77
description = "OC4IDS Datastore Pipeline"
88
version = "0.1.0"
99
readme = "README.md"
10-
dependencies = []
10+
dependencies = [
11+
"libcoveoc4ids",
12+
"requests",
13+
]
1114

1215
[project.optional-dependencies]
1316
dev = [
@@ -17,6 +20,8 @@ dev = [
1720
"Flake8-pyproject",
1821
"mypy",
1922
"pytest",
23+
"pytest-mock",
24+
"types-requests",
2025
]
2126

2227
[project.scripts]
@@ -30,3 +35,11 @@ max-line-length = 88
3035

3136
[tool.mypy]
3237
strict = true
38+
39+
[[tool.mypy.overrides]]
40+
module = ["libcoveoc4ids.*"]
41+
follow_untyped_imports = true
42+
43+
[tool.pytest.ini_options]
44+
log_cli = true
45+
log_cli_level = "INFO"

requirements_dev.txt

Lines changed: 149 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,75 @@
44
#
55
# pip-compile --extra=dev --output-file=requirements_dev.txt pyproject.toml
66
#
7+
attrs==25.1.0
8+
# via
9+
# cattrs
10+
# jsonschema
11+
# referencing
12+
# requests-cache
13+
backports-datetime-fromisoformat==2.0.3
14+
# via flattentool
715
black==25.1.0
816
# via oc4ids-datastore-pipeline (pyproject.toml)
17+
btrees==6.1
18+
# via zodb
19+
cattrs==24.1.2
20+
# via requests-cache
21+
certifi==2025.1.31
22+
# via requests
23+
cffi==1.17.1
24+
# via persistent
25+
charset-normalizer==3.4.1
26+
# via requests
927
click==8.1.8
10-
# via black
28+
# via
29+
# black
30+
# libcoveoc4ids
31+
# libcoveocds
32+
defusedxml==0.7.1
33+
# via odfpy
34+
et-xmlfile==2.0.0
35+
# via openpyxl
1136
flake8==7.1.1
1237
# via
1338
# flake8-pyproject
1439
# oc4ids-datastore-pipeline (pyproject.toml)
1540
flake8-pyproject==1.2.3
1641
# via oc4ids-datastore-pipeline (pyproject.toml)
42+
flattentool==0.27.0
43+
# via libcove
44+
idna==3.10
45+
# via requests
46+
ijson==3.3.0
47+
# via flattentool
1748
iniconfig==2.0.0
1849
# via pytest
1950
isort==6.0.0
2051
# via oc4ids-datastore-pipeline (pyproject.toml)
52+
json-merge-patch==0.2
53+
# via ocdsextensionregistry
54+
jsonref==1.1.0
55+
# via
56+
# flattentool
57+
# libcove
58+
# libcoveocds
59+
# ocdsextensionregistry
60+
jsonschema==4.23.0
61+
# via
62+
# libcove
63+
# libcoveocds
64+
jsonschema-specifications==2024.10.1
65+
# via jsonschema
66+
libcove==0.32.1
67+
# via
68+
# libcoveoc4ids
69+
# libcoveocds
70+
libcoveoc4ids==0.9.0
71+
# via oc4ids-datastore-pipeline (pyproject.toml)
72+
libcoveocds==0.16.4
73+
# via libcoveoc4ids
74+
lxml==5.3.0
75+
# via flattentool
2176
mccabe==0.7.0
2277
# via flake8
2378
mypy==1.14.1
@@ -26,21 +81,112 @@ mypy-extensions==1.0.0
2681
# via
2782
# black
2883
# mypy
84+
ocdsextensionregistry==0.6.9
85+
# via libcoveocds
86+
odfpy==1.4.1
87+
# via flattentool
88+
openpyxl==3.1.5
89+
# via flattentool
2990
packaging==24.2
3091
# via
3192
# black
3293
# pytest
3394
pathspec==0.12.1
3495
# via black
96+
persistent==6.1
97+
# via
98+
# btrees
99+
# zodb
35100
platformdirs==4.3.6
36-
# via black
101+
# via
102+
# black
103+
# requests-cache
37104
pluggy==1.5.0
38105
# via pytest
39106
pycodestyle==2.12.1
40107
# via flake8
108+
pycparser==2.22
109+
# via cffi
41110
pyflakes==3.2.0
42111
# via flake8
43112
pytest==8.3.4
113+
# via
114+
# oc4ids-datastore-pipeline (pyproject.toml)
115+
# pytest-mock
116+
pytest-mock==3.14.0
117+
# via oc4ids-datastore-pipeline (pyproject.toml)
118+
pytz==2025.1
119+
# via flattentool
120+
referencing==0.36.2
121+
# via
122+
# jsonschema
123+
# jsonschema-specifications
124+
# libcove
125+
# libcoveocds
126+
requests==2.32.3
127+
# via
128+
# libcove
129+
# libcoveocds
130+
# oc4ids-datastore-pipeline (pyproject.toml)
131+
# ocdsextensionregistry
132+
# requests-cache
133+
requests-cache==1.2.1
134+
# via ocdsextensionregistry
135+
rfc3339-validator==0.1.4
136+
# via libcove
137+
rfc3987==1.3.8
138+
# via libcove
139+
rpds-py==0.22.3
140+
# via
141+
# jsonschema
142+
# referencing
143+
schema==0.7.7
144+
# via flattentool
145+
six==1.17.0
146+
# via
147+
# rfc3339-validator
148+
# url-normalize
149+
transaction==5.0
150+
# via zodb
151+
types-requests==2.32.0.20241016
44152
# via oc4ids-datastore-pipeline (pyproject.toml)
45153
typing-extensions==4.12.2
46-
# via mypy
154+
# via
155+
# mypy
156+
# referencing
157+
url-normalize==1.4.3
158+
# via requests-cache
159+
urllib3==2.3.0
160+
# via
161+
# requests
162+
# requests-cache
163+
# types-requests
164+
xmltodict==0.14.2
165+
# via flattentool
166+
zc-lockfile==3.0.post1
167+
# via zodb
168+
zc-zlibstorage==1.2.0
169+
# via flattentool
170+
zconfig==4.2
171+
# via zodb
172+
zodb==6.0
173+
# via
174+
# flattentool
175+
# zc-zlibstorage
176+
zodbpickle==4.1.1
177+
# via zodb
178+
zope-deferredimport==5.0
179+
# via persistent
180+
zope-interface==7.2
181+
# via
182+
# btrees
183+
# persistent
184+
# transaction
185+
# zc-zlibstorage
186+
# zodb
187+
# zope-proxy
188+
zope-proxy==6.1
189+
# via zope-deferredimport
190+
191+
# The following packages are considered to be unsafe in a requirements file:
192+
# setuptools

0 commit comments

Comments
 (0)