Skip to content

Commit 4f8e1ce

Browse files
feat: validate datasets
1 parent a358860 commit 4f8e1ce

File tree

4 files changed

+187
-7
lines changed

4 files changed

+187
-7
lines changed

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Any
33

44
import requests
5+
from libcoveoc4ids.api import oc4ids_json_output
56

67
logger = logging.getLogger(__name__)
78

@@ -28,10 +29,24 @@ def download_json(url: str) -> Any:
2829
raise Exception("Download failed", e)
2930

3031

32+
def validate_json(dataset_name: str, json_data: Any) -> None:
    """Validate a dataset with ``oc4ids_json_output`` and raise if it is invalid.

    Args:
        dataset_name: Name of the dataset; used only in log messages.
        json_data: Parsed JSON handed to ``oc4ids_json_output`` as ``json_data``.

    Raises:
        Exception: ``Exception("Validation failed", cause)`` when the validator
            itself raises, when its result has no ``validation_errors_count``
            key, or when that count is greater than zero. The underlying error
            is available both as ``args[1]`` and as ``__cause__``.
    """
    logger.info(f"Validating dataset {dataset_name}")
    try:
        validation_result = oc4ids_json_output(json_data=json_data)
        validation_errors_count = validation_result["validation_errors_count"]
        if validation_errors_count > 0:
            # Raised inside the try on purpose so the handler below wraps it in
            # the same "Validation failed" envelope as any other failure.
            raise Exception(f"Dataset has {validation_errors_count} validation errors")
        logger.info(f"Dataset {dataset_name} is valid")
    except Exception as e:
        # Keep the (message, cause) args tuple so str() of the error still
        # contains both texts; `from e` records the cause explicitly.
        raise Exception("Validation failed", e) from e
42+
43+
3144
def process_dataset(dataset_name: str, dataset_url: str) -> None:
    """Download and validate a single dataset without propagating failures.

    Args:
        dataset_name: Name of the dataset; used in log messages and passed to
            ``validate_json``.
        dataset_url: URL handed to ``download_json``.

    Any exception raised while downloading or validating is caught and logged
    as a warning, so this function does not raise for those failures.
    """
    logger.info(f"Processing dataset {dataset_name}")
    try:
        # The download result is fed straight into validation.
        validate_json(dataset_name, download_json(dataset_url))
        logger.info(f"Processed dataset {dataset_name}")
    except Exception as exc:
        logger.warning(f"Failed to process dataset {dataset_name} with error {exc}")
3752

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ description = "OC4IDS Datastore Pipeline"
88
version = "0.1.0"
99
readme = "README.md"
1010
dependencies = [
11-
"requests"
11+
"libcoveoc4ids",
12+
"requests",
1213
]
1314

1415
[project.optional-dependencies]
@@ -35,6 +36,10 @@ max-line-length = 88
3536
[tool.mypy]
3637
strict = true
3738

39+
[[tool.mypy.overrides]]
40+
module = ["libcoveoc4ids.*"]
41+
follow_untyped_imports = true
42+
3843
[tool.pytest.ini_options]
3944
log_cli = true
4045
log_cli_level = "INFO"

requirements_dev.txt

Lines changed: 132 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,75 @@
44
#
55
# pip-compile --extra=dev --output-file=requirements_dev.txt pyproject.toml
66
#
7+
attrs==25.1.0
8+
# via
9+
# cattrs
10+
# jsonschema
11+
# referencing
12+
# requests-cache
13+
backports-datetime-fromisoformat==2.0.3
14+
# via flattentool
715
black==25.1.0
816
# via oc4ids-datastore-pipeline (pyproject.toml)
17+
btrees==6.1
18+
# via zodb
19+
cattrs==24.1.2
20+
# via requests-cache
921
certifi==2025.1.31
1022
# via requests
23+
cffi==1.17.1
24+
# via persistent
1125
charset-normalizer==3.4.1
1226
# via requests
1327
click==8.1.8
14-
# via black
28+
# via
29+
# black
30+
# libcoveoc4ids
31+
# libcoveocds
32+
defusedxml==0.7.1
33+
# via odfpy
34+
et-xmlfile==2.0.0
35+
# via openpyxl
1536
flake8==7.1.1
1637
# via
1738
# flake8-pyproject
1839
# oc4ids-datastore-pipeline (pyproject.toml)
1940
flake8-pyproject==1.2.3
2041
# via oc4ids-datastore-pipeline (pyproject.toml)
42+
flattentool==0.27.0
43+
# via libcove
2144
idna==3.10
2245
# via requests
46+
ijson==3.3.0
47+
# via flattentool
2348
iniconfig==2.0.0
2449
# via pytest
2550
isort==6.0.0
2651
# via oc4ids-datastore-pipeline (pyproject.toml)
52+
json-merge-patch==0.2
53+
# via ocdsextensionregistry
54+
jsonref==1.1.0
55+
# via
56+
# flattentool
57+
# libcove
58+
# libcoveocds
59+
# ocdsextensionregistry
60+
jsonschema==4.23.0
61+
# via
62+
# libcove
63+
# libcoveocds
64+
jsonschema-specifications==2024.10.1
65+
# via jsonschema
66+
libcove==0.32.1
67+
# via
68+
# libcoveoc4ids
69+
# libcoveocds
70+
libcoveoc4ids==0.9.0
71+
# via oc4ids-datastore-pipeline (pyproject.toml)
72+
libcoveocds==0.16.4
73+
# via libcoveoc4ids
74+
lxml==5.3.0
75+
# via flattentool
2776
mccabe==0.7.0
2877
# via flake8
2978
mypy==1.14.1
@@ -32,18 +81,32 @@ mypy-extensions==1.0.0
3281
# via
3382
# black
3483
# mypy
84+
ocdsextensionregistry==0.6.9
85+
# via libcoveocds
86+
odfpy==1.4.1
87+
# via flattentool
88+
openpyxl==3.1.5
89+
# via flattentool
3590
packaging==24.2
3691
# via
3792
# black
3893
# pytest
3994
pathspec==0.12.1
4095
# via black
96+
persistent==6.1
97+
# via
98+
# btrees
99+
# zodb
41100
platformdirs==4.3.6
42-
# via black
101+
# via
102+
# black
103+
# requests-cache
43104
pluggy==1.5.0
44105
# via pytest
45106
pycodestyle==2.12.1
46107
# via flake8
108+
pycparser==2.22
109+
# via cffi
47110
pyflakes==3.2.0
48111
# via flake8
49112
pytest==8.3.4
@@ -52,13 +115,78 @@ pytest==8.3.4
52115
# pytest-mock
53116
pytest-mock==3.14.0
54117
# via oc4ids-datastore-pipeline (pyproject.toml)
118+
pytz==2025.1
119+
# via flattentool
120+
referencing==0.36.2
121+
# via
122+
# jsonschema
123+
# jsonschema-specifications
124+
# libcove
125+
# libcoveocds
55126
requests==2.32.3
56-
# via oc4ids-datastore-pipeline (pyproject.toml)
127+
# via
128+
# libcove
129+
# libcoveocds
130+
# oc4ids-datastore-pipeline (pyproject.toml)
131+
# ocdsextensionregistry
132+
# requests-cache
133+
requests-cache==1.2.1
134+
# via ocdsextensionregistry
135+
rfc3339-validator==0.1.4
136+
# via libcove
137+
rfc3987==1.3.8
138+
# via libcove
139+
rpds-py==0.22.3
140+
# via
141+
# jsonschema
142+
# referencing
143+
schema==0.7.7
144+
# via flattentool
145+
six==1.17.0
146+
# via
147+
# rfc3339-validator
148+
# url-normalize
149+
transaction==5.0
150+
# via zodb
57151
types-requests==2.32.0.20241016
58152
# via oc4ids-datastore-pipeline (pyproject.toml)
59153
typing-extensions==4.12.2
60-
# via mypy
154+
# via
155+
# mypy
156+
# referencing
157+
url-normalize==1.4.3
158+
# via requests-cache
61159
urllib3==2.3.0
62160
# via
63161
# requests
162+
# requests-cache
64163
# types-requests
164+
xmltodict==0.14.2
165+
# via flattentool
166+
zc-lockfile==3.0.post1
167+
# via zodb
168+
zc-zlibstorage==1.2.0
169+
# via flattentool
170+
zconfig==4.2
171+
# via zodb
172+
zodb==6.0
173+
# via
174+
# flattentool
175+
# zc-zlibstorage
176+
zodbpickle==4.1.1
177+
# via zodb
178+
zope-deferredimport==5.0
179+
# via persistent
180+
zope-interface==7.2
181+
# via
182+
# btrees
183+
# persistent
184+
# transaction
185+
# zc-zlibstorage
186+
# zodb
187+
# zope-proxy
188+
zope-proxy==6.1
189+
# via zope-deferredimport
190+
191+
# The following packages are considered to be unsafe in a requirements file:
192+
# setuptools

tests/test_pipeline.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import pytest
22
from pytest_mock import MockerFixture
33

4-
from oc4ids_datastore_pipeline.pipeline import download_json, process_dataset
4+
from oc4ids_datastore_pipeline.pipeline import (
5+
download_json,
6+
process_dataset,
7+
validate_json,
8+
)
59

610

711
def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
@@ -15,6 +19,34 @@ def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
1519
assert "Mocked exception" in str(exc_info.value)
1620

1721

22+
def test_validate_json_raises_failure_exception(mocker: MockerFixture) -> None:
    """An error raised by the validator is wrapped as "Validation failed"."""
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output",
        side_effect=Exception("Mocked exception"),
    )

    with pytest.raises(Exception) as raised:
        validate_json(dataset_name="test_dataset", json_data={})

    message = str(raised.value)
    assert "Validation failed" in message
    assert "Mocked exception" in message
33+
34+
35+
def test_validate_json_raises_validation_errors_exception(
    mocker: MockerFixture,
) -> None:
    """A non-zero ``validation_errors_count`` is reported as a failure."""
    mocker.patch(
        "oc4ids_datastore_pipeline.pipeline.oc4ids_json_output",
        return_value={"validation_errors_count": 2},
    )

    with pytest.raises(Exception) as raised:
        validate_json(dataset_name="test_dataset", json_data={})

    message = str(raised.value)
    assert "Validation failed" in message
    assert "Dataset has 2 validation errors" in message
48+
49+
1850
def test_process_dataset_catches_exception(mocker: MockerFixture) -> None:
1951
patch_download_json = mocker.patch(
2052
"oc4ids_datastore_pipeline.pipeline.download_json"

0 commit comments

Comments
 (0)