Skip to content

Commit 6d186ef

Browse files
Merge pull request #17 from OpenDataServices/5-licenses
Save license metadata to database
2 parents e1256ae + 0b73826 commit 6d186ef

File tree

6 files changed

+241
-57
lines changed

6 files changed

+241
-57
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""add license columns to dataset table
2+
3+
Revision ID: 85905d23accc
4+
Revises: aaabf849b37f
5+
Create Date: 2025-02-05 09:45:04.056529
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
from alembic import op
12+
import sqlalchemy as sa
13+
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = "85905d23accc"
17+
down_revision: Union[str, None] = "aaabf849b37f"
18+
branch_labels: Union[str, Sequence[str], None] = None
19+
depends_on: Union[str, Sequence[str], None] = None
20+
21+
22+
def upgrade() -> None:
    """Add nullable license_url and license_name columns to the dataset table."""
    # Columns are added in the same order as before: URL first, then name.
    for column_name in ("license_url", "license_name"):
        op.add_column("dataset", sa.Column(column_name, sa.String(), nullable=True))
27+
28+
29+
def downgrade() -> None:
    """Drop the license_name and license_url columns from the dataset table."""
    # Columns are dropped in the reverse of the order upgrade() adds them.
    for column_name in ("license_name", "license_url"):
        op.drop_column("dataset", column_name)

oc4ids_datastore_pipeline/database.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import datetime
22
import logging
33
import os
4+
from typing import Optional
45

56
from sqlalchemy import (
67
DateTime,
@@ -25,6 +26,8 @@ class Dataset(Base):
2526
dataset_id: Mapped[str] = mapped_column(String, primary_key=True)
2627
source_url: Mapped[str] = mapped_column(String)
2728
publisher_name: Mapped[str] = mapped_column(String)
29+
license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
30+
license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True)
2831
json_url: Mapped[str] = mapped_column(String)
2932
updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
3033

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,14 @@
88
from libcoveoc4ids.api import oc4ids_json_output
99

1010
from oc4ids_datastore_pipeline.database import Dataset, save_dataset
11+
from oc4ids_datastore_pipeline.registry import (
12+
fetch_registered_datasets,
13+
get_license_name_from_url,
14+
)
1115

1216
logger = logging.getLogger(__name__)
1317

1418

15-
def fetch_registered_datasets() -> dict[str, str]:
16-
logger.info("Fetching registered datasets list from registry")
17-
try:
18-
url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json" # noqa: E501
19-
r = requests.get(url)
20-
r.raise_for_status()
21-
json_data = r.json()
22-
registered_datasets = {
23-
key: value["fields"]["url"]["value"]
24-
for (key, value) in json_data["records"].items()
25-
}
26-
registered_datasets_count = len(registered_datasets)
27-
logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
28-
return registered_datasets
29-
except Exception as e:
30-
raise Exception("Failed to fetch datasets list from registry", e)
31-
32-
3319
def download_json(url: str) -> Any:
3420
logger.info(f"Downloading json from {url}")
3521
try:
@@ -42,7 +28,7 @@ def download_json(url: str) -> Any:
4228
raise Exception("Download failed", e)
4329

4430

45-
def validate_json(dataset_name: str, json_data: Any) -> None:
31+
def validate_json(dataset_name: str, json_data: dict[str, Any]) -> None:
4632
logger.info(f"Validating dataset {dataset_name}")
4733
try:
4834
validation_result = oc4ids_json_output(json_data=json_data)
@@ -54,26 +40,32 @@ def validate_json(dataset_name: str, json_data: Any) -> None:
5440
raise Exception("Validation failed", e)
5541

5642

57-
def write_json_to_file(file_name: str, json_data: Any) -> None:
43+
def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
    """Write ``json_data`` to ``file_name`` as indented JSON and return the path.

    Args:
        file_name: Path of the file to write. Its parent directory is created
            if the path has a directory component.
        json_data: The data to serialise.

    Returns:
        ``file_name``, unchanged, once the file has been written.

    Raises:
        Exception: If creating the directory, opening the file or serialising
            the data fails. The original error is chained as the cause and is
            also kept as the second element of ``args``.
    """
    logger.info(f"Writing dataset to file {file_name}")
    try:
        directory = os.path.dirname(file_name)
        # os.makedirs("") raises FileNotFoundError, so only create the parent
        # directory when the path actually has one.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(file_name, "w") as file:
            json.dump(json_data, file, indent=4)
        logger.info(f"Finished writing to {file_name}")
        return file_name
    except Exception as e:
        raise Exception("Error while writing to JSON file", e) from e
6653

6754

6855
def save_dataset_metadata(
69-
dataset_name: str, source_url: str, publisher_name: str, file_name: str
56+
dataset_name: str, source_url: str, json_data: dict[str, Any], json_url: str
7057
) -> None:
7158
logger.info(f"Saving metadata for dataset {dataset_name}")
59+
publisher_name = json_data.get("publisher", {}).get("name", "")
60+
license_url = json_data.get("license", None)
61+
license_name = get_license_name_from_url(license_url) if license_url else None
7262
dataset = Dataset(
7363
dataset_id=dataset_name,
7464
source_url=source_url,
7565
publisher_name=publisher_name,
76-
json_url=file_name,
66+
license_url=license_url,
67+
license_name=license_name,
68+
json_url=json_url,
7769
updated_at=datetime.datetime.now(datetime.UTC),
7870
)
7971
save_dataset(dataset)
@@ -84,14 +76,12 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None:
8476
try:
8577
json_data = download_json(dataset_url)
8678
validate_json(dataset_name, json_data)
87-
file_name = f"data/{dataset_name}.json"
88-
write_json_to_file(file_name, json_data)
89-
publisher_name = json_data.get("publisher", {}).get("name", "")
79+
json_url = write_json_to_file(f"data/{dataset_name}.json", json_data)
9080
save_dataset_metadata(
9181
dataset_name=dataset_name,
9282
source_url=dataset_url,
93-
publisher_name=publisher_name,
94-
file_name=file_name,
83+
json_data=json_data,
84+
json_url=json_url,
9585
)
9686
logger.info(f"Processed dataset {dataset_name}")
9787
except Exception as e:
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import logging
2+
from typing import Optional
3+
4+
import requests
5+
6+
logger = logging.getLogger(__name__)
7+
8+
9+
_license_mappings = None
10+
11+
12+
def fetch_registered_datasets(timeout: float = 30.0) -> dict[str, str]:
    """Fetch the list of registered datasets from the OC4IDS registry.

    Args:
        timeout: Seconds to wait for the registry to respond before giving up.

    Returns:
        A mapping of each record key in the registry response to the value of
        that record's ``url`` field.

    Raises:
        Exception: If the request fails or times out, or the response does not
            have the expected structure. The original error is chained as the
            cause and is also kept as the second element of ``args``.
    """
    logger.info("Fetching registered datasets list from registry")
    try:
        url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json"  # noqa: E501
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        json_data = r.json()
        registered_datasets = {
            key: value["fields"]["url"]["value"]
            for (key, value) in json_data["records"].items()
        }
        registered_datasets_count = len(registered_datasets)
        logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
        return registered_datasets
    except Exception as e:
        raise Exception("Failed to fetch datasets list from registry", e) from e
28+
29+
30+
def fetch_license_mappings(timeout: float = 30.0) -> dict[str, str]:
    """Fetch the license URL to license title mappings from the OC4IDS registry.

    Each license record in the registry response can list several URLs; every
    one of them is mapped to that record's title.

    Args:
        timeout: Seconds to wait for the registry to respond before giving up.

    Returns:
        A mapping of license URL to license title. An empty mapping is returned
        if the request fails, times out, or the response does not have the
        expected structure.
    """
    logger.info("Fetching license mappings from registry")
    try:
        url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/license/records_api.json"  # noqa: E501
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        json_data = r.json()
        # "entry" rather than "license", to avoid shadowing the builtin.
        return {
            link["fields"]["url"]["value"]: entry["fields"]["title"]["value"]
            for entry in json_data["records"].values()
            for link in entry["fields"]["urls"]["values"]
        }
    except Exception as e:
        # Best-effort lookup: a failure here is logged and yields no mappings
        # rather than raising to the caller.
        logger.warning(
            "Failed to fetch license mappings from registry, with error: %s", e
        )
        return {}
47+
48+
49+
def get_license_name_from_url(
    url: str, force_refresh: Optional[bool] = False
) -> Optional[str]:
    """Return the license title registered for ``url``, or None if unknown.

    The mappings are fetched on the first call and kept in the module-level
    ``_license_mappings`` cache. Whatever ``fetch_license_mappings`` returns is
    cached, including an empty mapping.

    Args:
        url: The license URL to look up.
        force_refresh: When truthy, re-fetch the mappings even if cached.

    Returns:
        The license title for ``url``, or None when it is not in the mappings.
    """
    global _license_mappings
    cache_is_empty = _license_mappings is None
    if force_refresh or cache_is_empty:
        _license_mappings = fetch_license_mappings()
    return _license_mappings.get(url)

tests/test_pipeline.py

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,18 @@
11
import os
22
import tempfile
33
from textwrap import dedent
4-
from unittest.mock import MagicMock
54

65
import pytest
76
from pytest_mock import MockerFixture
87

98
from oc4ids_datastore_pipeline.pipeline import (
109
download_json,
11-
fetch_registered_datasets,
1210
process_dataset,
1311
validate_json,
1412
write_json_to_file,
1513
)
1614

1715

18-
def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
19-
mock_response = MagicMock()
20-
mock_response.json.return_value = {
21-
"records": {
22-
"test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}}
23-
}
24-
}
25-
patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
26-
patch_get.return_value = mock_response
27-
28-
result = fetch_registered_datasets()
29-
30-
assert result == {"test_dataset": "https://test_dataset.json"}
31-
32-
33-
def test_fetch_registered_datasets_raises_failure_exception(
34-
mocker: MockerFixture,
35-
) -> None:
36-
patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
37-
patch_get.side_effect = Exception("Mocked exception")
38-
39-
with pytest.raises(Exception) as exc_info:
40-
fetch_registered_datasets()
41-
42-
assert "Failed to fetch datasets list from registry" in str(exc_info.value)
43-
assert "Mocked exception" in str(exc_info.value)
44-
45-
4616
def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None:
4717
patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
4818
patch_get.side_effect = Exception("Mocked exception")

tests/test_registry.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
from unittest.mock import MagicMock
2+
3+
import pytest
4+
from pytest_mock import MockerFixture
5+
6+
from oc4ids_datastore_pipeline.registry import (
7+
fetch_license_mappings,
8+
fetch_registered_datasets,
9+
get_license_name_from_url,
10+
)
11+
12+
13+
def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
    """fetch_registered_datasets maps each record key to its url value."""
    mock_response = MagicMock()
    mock_response.json.return_value = {
        "records": {
            "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}}
        }
    }
    # Patch requests through the module under test (registry), so the test does
    # not depend on the pipeline module happening to import requests.
    patch_get = mocker.patch("oc4ids_datastore_pipeline.registry.requests.get")
    patch_get.return_value = mock_response

    result = fetch_registered_datasets()

    assert result == {"test_dataset": "https://test_dataset.json"}
26+
27+
28+
def test_fetch_registered_datasets_raises_failure_exception(
    mocker: MockerFixture,
) -> None:
    """A failing request surfaces as an exception carrying both messages."""
    # Patch requests through the module under test (registry), so the test does
    # not depend on the pipeline module happening to import requests.
    patch_get = mocker.patch("oc4ids_datastore_pipeline.registry.requests.get")
    patch_get.side_effect = Exception("Mocked exception")

    with pytest.raises(Exception) as exc_info:
        fetch_registered_datasets()

    assert "Failed to fetch datasets list from registry" in str(exc_info.value)
    assert "Mocked exception" in str(exc_info.value)
39+
40+
41+
def test_fetch_license_mappings(mocker: MockerFixture) -> None:
    """Every URL listed for a license record maps to that record's title."""
    mock_response = MagicMock()
    mock_response.json.return_value = {
        "records": {
            "license_1": {
                "fields": {
                    "title": {"value": "License 1"},
                    "urls": {
                        "values": [
                            {
                                "fields": {
                                    "url": {"value": "https://license_1.com/license"}
                                }
                            },
                            {
                                "fields": {
                                    "url": {
                                        "value": "https://license_1.com/different_url"
                                    }
                                }
                            },
                        ]
                    },
                }
            },
            "license_2": {
                "fields": {
                    "title": {"value": "License 2"},
                    "urls": {
                        "values": [
                            {
                                "fields": {
                                    "url": {"value": "https://license_2.com/license"}
                                }
                            },
                        ]
                    },
                }
            },
        }
    }
    # Patch requests through the module under test (registry), so the test does
    # not depend on the pipeline module happening to import requests.
    patch_get = mocker.patch("oc4ids_datastore_pipeline.registry.requests.get")
    patch_get.return_value = mock_response

    result = fetch_license_mappings()

    assert result == {
        "https://license_1.com/license": "License 1",
        "https://license_1.com/different_url": "License 1",
        "https://license_2.com/license": "License 2",
    }
92+
93+
94+
def test_fetch_license_mappings_catches_exception(
    mocker: MockerFixture,
) -> None:
    """A failing request is swallowed and yields an empty mapping."""
    # Patch requests through the module under test (registry), so the test does
    # not depend on the pipeline module happening to import requests.
    patch_get = mocker.patch("oc4ids_datastore_pipeline.registry.requests.get")
    patch_get.side_effect = Exception("Mocked exception")

    result = fetch_license_mappings()

    assert result == {}
103+
104+
105+
def test_get_license_name_from_url(mocker: MockerFixture) -> None:
    """A URL present in the fetched mappings resolves to its license title."""
    known_licenses = {
        "https://license_1.com/license": "License 1",
        "https://license_2.com/license": "License 2",
    }
    mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings",
        return_value=known_licenses,
    )

    # force_refresh bypasses any mappings cached by an earlier test.
    resolved_name = get_license_name_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert resolved_name == "License 2"
119+
120+
121+
def test_get_license_name_from_url_not_in_mapping(mocker: MockerFixture) -> None:
    """A URL absent from the fetched mappings resolves to None."""
    known_licenses = {
        "https://license_1.com/license": "License 1",
    }
    mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings",
        return_value=known_licenses,
    )

    # force_refresh bypasses any mappings cached by an earlier test.
    resolved_name = get_license_name_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert resolved_name is None

0 commit comments

Comments
 (0)