From f669602743a152965e1ba382bf5a1802ee78c3ec Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:18:45 +0200 Subject: [PATCH 1/4] feat: fetch license mappings from registry --- oc4ids_datastore_pipeline/pipeline.py | 19 ++++++++ tests/test_pipeline.py | 65 +++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 3b4750d..a45f96c 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -30,6 +30,25 @@ def fetch_registered_datasets() -> dict[str, str]: raise Exception("Failed to fetch datasets list from registry", e) +def fetch_license_mappings() -> dict[str, str]: + logger.info("Fetching license mappings from registry") + try: + url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/license/records_api.json" # noqa: E501 + r = requests.get(url) + r.raise_for_status() + json_data = r.json() + return { + urls["fields"]["url"]["value"]: license["fields"]["title"]["value"] + for license in json_data["records"].values() + for urls in license["fields"]["urls"]["values"] + } + except Exception as e: + logger.warning( + "Failed to fetch license mappings from registry, with error: " + str(e), + ) + return {} + + def download_json(url: str) -> Any: logger.info(f"Downloading json from {url}") try: diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index b3b2179..575d2d6 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -8,6 +8,7 @@ from oc4ids_datastore_pipeline.pipeline import ( download_json, + fetch_license_mappings, fetch_registered_datasets, process_dataset, validate_json, @@ -43,6 +44,70 @@ def test_fetch_registered_datasets_raises_failure_exception( assert "Mocked exception" in str(exc_info.value) +def test_fetch_license_mappings(mocker: MockerFixture) -> None: + mock_response = MagicMock() + mock_response.json.return_value = { + "records": { + "license_1": { + "fields": { + "title": {"value": "License 1"}, + "urls": { + "values": [ + { + "fields": { + "url": {"value": "https://license_1.com/license"} + } + }, + { + "fields": { + "url": { + "value": "https://license_1.com/different_url" + } + } + }, + ] + }, + } + }, + "license_2": { + "fields": { + "title": {"value": "License 2"}, + "urls": { + "values": [ + { + "fields": { + "url": {"value": "https://license_2.com/license"} + } + }, + ] + }, + } + }, + } + } + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.return_value = mock_response + + result = fetch_license_mappings() + + assert result == { + "https://license_1.com/license": "License 1", + "https://license_1.com/different_url": "License 1", + "https://license_2.com/license": "License 2", + } + + +def test_fetch_license_mappings_catches_exception( + mocker: MockerFixture, +) -> None: + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.side_effect = Exception("Mocked exception") + + result = fetch_license_mappings() + + assert result == {} + + def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") patch_get.side_effect = Exception("Mocked exception") From d65b946db36893d7f9b394d3dfe4e245e5be71d2 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:31:35 +0200 Subject: [PATCH 2/4] refactor: move registry logic to separate module --- oc4ids_datastore_pipeline/pipeline.py | 38 +--------- oc4ids_datastore_pipeline/registry.py | 42 +++++++++++ tests/test_pipeline.py | 95 ------------------------ tests/test_registry.py | 101 ++++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 132 deletions(-) create mode 100644 oc4ids_datastore_pipeline/registry.py create mode 100644 tests/test_registry.py diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index a45f96c..03d6ab5 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -8,47 +8,11 @@ from libcoveoc4ids.api import oc4ids_json_output from oc4ids_datastore_pipeline.database import Dataset, save_dataset +from oc4ids_datastore_pipeline.registry import fetch_registered_datasets logger = logging.getLogger(__name__) -def fetch_registered_datasets() -> dict[str, str]: - logger.info("Fetching registered datasets list from registry") - try: - url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json" # noqa: E501 - r = requests.get(url) - r.raise_for_status() - json_data = r.json() - registered_datasets = { - key: value["fields"]["url"]["value"] - for (key, value) in json_data["records"].items() - } - registered_datasets_count = len(registered_datasets) - logger.info(f"Fetched URLs for {registered_datasets_count} datasets") - return registered_datasets - except Exception as e: - raise Exception("Failed to fetch datasets list from registry", e) - - -def fetch_license_mappings() -> dict[str, str]: - logger.info("Fetching license mappings from registry") - try: - url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/license/records_api.json" # noqa: E501 - r = requests.get(url) - r.raise_for_status() - json_data = r.json() - return { - urls["fields"]["url"]["value"]: license["fields"]["title"]["value"] - for license in json_data["records"].values() - for urls in license["fields"]["urls"]["values"] - } - except Exception as e: - logger.warning( - "Failed to fetch license mappings from registry, with error: " + str(e), - ) - return {} - - def download_json(url: str) -> Any: logger.info(f"Downloading json from {url}") try: diff --git a/oc4ids_datastore_pipeline/registry.py b/oc4ids_datastore_pipeline/registry.py new file mode 100644 index 0000000..783dc2c --- /dev/null +++ b/oc4ids_datastore_pipeline/registry.py @@ -0,0 +1,42 @@ +import logging + +import requests + +logger = logging.getLogger(__name__) + + +def fetch_registered_datasets() -> dict[str, str]: + logger.info("Fetching registered datasets list from registry") + try: + url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/dataset/records_api.json" # noqa: E501 + r = requests.get(url) + r.raise_for_status() + json_data = r.json() + registered_datasets = { + key: value["fields"]["url"]["value"] + for (key, value) in json_data["records"].items() + } + registered_datasets_count = len(registered_datasets) + logger.info(f"Fetched URLs for {registered_datasets_count} datasets") + return registered_datasets + except Exception as e: + raise Exception("Failed to fetch datasets list from registry", e) + + +def fetch_license_mappings() -> dict[str, str]: + logger.info("Fetching license mappings from registry") + try: + url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/license/records_api.json" # noqa: E501 + r = requests.get(url) + r.raise_for_status() + json_data = r.json() + return { + urls["fields"]["url"]["value"]: license["fields"]["title"]["value"] + for license in json_data["records"].values() + for urls in license["fields"]["urls"]["values"] + } + except Exception as e: + logger.warning( + "Failed to fetch license mappings from registry, with error: " + str(e), + ) + return {} diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 575d2d6..281c5f4 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,113 +1,18 @@ import os import tempfile from textwrap import dedent -from unittest.mock import MagicMock import pytest from pytest_mock import MockerFixture from oc4ids_datastore_pipeline.pipeline import ( download_json, - fetch_license_mappings, - fetch_registered_datasets, process_dataset, validate_json, write_json_to_file, ) -def test_fetch_registered_datasets(mocker: MockerFixture) -> None: - mock_response = MagicMock() - mock_response.json.return_value = { - "records": { - "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}} - } - } - patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") - patch_get.return_value = mock_response - - result = fetch_registered_datasets() - - assert result == {"test_dataset": "https://test_dataset.json"} - - -def test_fetch_registered_datasets_raises_failure_exception( - mocker: MockerFixture, -) -> None: - patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") - patch_get.side_effect = Exception("Mocked exception") - - with pytest.raises(Exception) as exc_info: - fetch_registered_datasets() - - assert "Failed to fetch datasets list from registry" in str(exc_info.value) - assert "Mocked exception" in str(exc_info.value) - - -def test_fetch_license_mappings(mocker: MockerFixture) -> None: - mock_response = MagicMock() - mock_response.json.return_value = { - "records": { - "license_1": { - "fields": { - "title": {"value": "License 1"}, - "urls": { - "values": [ - { - "fields": { - "url": {"value": "https://license_1.com/license"} - } - }, - { - "fields": { - "url": { - "value": "https://license_1.com/different_url" - } - } - }, - ] - }, - } - }, - "license_2": { - "fields": { - "title": {"value": "License 2"}, - "urls": { - "values": [ - { - "fields": { - "url": {"value": "https://license_2.com/license"} - } - }, - ] - }, - } - }, - } - } - patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") - patch_get.return_value = mock_response - - result = fetch_license_mappings() - - assert result == { - "https://license_1.com/license": "License 1", - "https://license_1.com/different_url": "License 1", - "https://license_2.com/license": "License 2", - } - - -def test_fetch_license_mappings_catches_exception( - mocker: MockerFixture, -) -> None: - patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") - patch_get.side_effect = Exception("Mocked exception") - - result = fetch_license_mappings() - - assert result == {} - - def test_download_json_raises_failure_exception(mocker: MockerFixture) -> None: patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") patch_get.side_effect = Exception("Mocked exception") diff --git a/tests/test_registry.py b/tests/test_registry.py new file mode 100644 index 0000000..c94c985 --- /dev/null +++ b/tests/test_registry.py @@ -0,0 +1,101 @@ +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture + +from oc4ids_datastore_pipeline.registry import ( + fetch_license_mappings, + fetch_registered_datasets, +) + + +def test_fetch_registered_datasets(mocker: MockerFixture) -> None: + mock_response = MagicMock() + mock_response.json.return_value = { + "records": { + "test_dataset": {"fields": {"url": {"value": "https://test_dataset.json"}}} + } + } + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.return_value = mock_response + + result = fetch_registered_datasets() + + assert result == {"test_dataset": "https://test_dataset.json"} + + +def test_fetch_registered_datasets_raises_failure_exception( + mocker: MockerFixture, +) -> None: + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.side_effect = Exception("Mocked exception") + + with pytest.raises(Exception) as exc_info: + fetch_registered_datasets() + + assert "Failed to fetch datasets list from registry" in str(exc_info.value) + assert "Mocked exception" in str(exc_info.value) + + +def test_fetch_license_mappings(mocker: MockerFixture) -> None: + mock_response = MagicMock() + mock_response.json.return_value = { + "records": { + "license_1": { + "fields": { + "title": {"value": "License 1"}, + "urls": { + "values": [ + { + "fields": { + "url": {"value": "https://license_1.com/license"} + } + }, + { + "fields": { + "url": { + "value": "https://license_1.com/different_url" + } + } + }, + ] + }, + } + }, + "license_2": { + "fields": { + "title": {"value": "License 2"}, + "urls": { + "values": [ + { + "fields": { + "url": {"value": "https://license_2.com/license"} + } + }, + ] + }, + } + }, + } + } + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.return_value = mock_response + + result = fetch_license_mappings() + + assert result == { + "https://license_1.com/license": "License 1", + "https://license_1.com/different_url": "License 1", + "https://license_2.com/license": "License 2", + } + + +def test_fetch_license_mappings_catches_exception( + mocker: MockerFixture, +) -> None: + patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") + patch_get.side_effect = Exception("Mocked exception") + + result = fetch_license_mappings() + + assert result == {} From 40117c6b5b2b0b83de78aaf458683971ed14cc06 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:35:45 +0200 Subject: [PATCH 3/4] refactor: pull metadata out of json_data within function --- oc4ids_datastore_pipeline/pipeline.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 03d6ab5..12be6cf 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -25,7 +25,7 @@ def download_json(url: str) -> Any: raise Exception("Download failed", e) -def validate_json(dataset_name: str, json_data: Any) -> None: +def validate_json(dataset_name: str, json_data: dict[str, Any]) -> None: logger.info(f"Validating dataset {dataset_name}") try: validation_result = oc4ids_json_output(json_data=json_data) @@ -37,26 +37,28 @@ def validate_json(dataset_name: str, json_data: Any) -> None: raise Exception("Validation failed", e) -def write_json_to_file(file_name: str, json_data: Any) -> None: +def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str: logger.info(f"Writing dataset to file {file_name}") try: os.makedirs(os.path.dirname(file_name), exist_ok=True) with open(file_name, "w") as file: json.dump(json_data, file, indent=4) logger.info(f"Finished writing to {file_name}") + return file_name except Exception as e: raise Exception("Error while writing to JSON file", e) def save_dataset_metadata( - dataset_name: str, source_url: str, publisher_name: str, file_name: str + dataset_name: str, source_url: str, json_data: dict[str, Any], json_url: str ) -> None: logger.info(f"Saving metadata for dataset {dataset_name}") + publisher_name = json_data.get("publisher", {}).get("name", "") dataset = Dataset( dataset_id=dataset_name, source_url=source_url, publisher_name=publisher_name, - json_url=file_name, + json_url=json_url, updated_at=datetime.datetime.now(datetime.UTC), ) save_dataset(dataset) @@ -67,14 +69,12 @@ def process_dataset(dataset_name: str, dataset_url: str) -> None: try: json_data = download_json(dataset_url) validate_json(dataset_name, json_data) - file_name = f"data/{dataset_name}.json" - write_json_to_file(file_name, json_data) - publisher_name = json_data.get("publisher", {}).get("name", "") + json_url = write_json_to_file(f"data/{dataset_name}.json", json_data) save_dataset_metadata( dataset_name=dataset_name, source_url=dataset_url, - publisher_name=publisher_name, - file_name=file_name, + json_data=json_data, + json_url=json_url, ) logger.info(f"Processed dataset {dataset_name}") except Exception as e: From 0b7382624c29d5d188f5fbd942c0b5d9577a3d12 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 5 Feb 2025 09:46:28 +0200 Subject: [PATCH 4/4] feat: save license metadata to database --- ...cc_add_license_columns_to_dataset_table.py | 33 +++++++++++++++++++ oc4ids_datastore_pipeline/database.py | 3 ++ oc4ids_datastore_pipeline/pipeline.py | 9 ++++- oc4ids_datastore_pipeline/registry.py | 13 ++++++++ tests/test_registry.py | 32 ++++++++++++++++++ 5 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 migrations/versions/85905d23accc_add_license_columns_to_dataset_table.py diff --git a/migrations/versions/85905d23accc_add_license_columns_to_dataset_table.py b/migrations/versions/85905d23accc_add_license_columns_to_dataset_table.py new file mode 100644 index 0000000..315df6f --- /dev/null +++ b/migrations/versions/85905d23accc_add_license_columns_to_dataset_table.py @@ -0,0 +1,33 @@ +"""add license columns to dataset table + +Revision ID: 85905d23accc +Revises: aaabf849b37f +Create Date: 2025-02-05 09:45:04.056529 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "85905d23accc" +down_revision: Union[str, None] = "aaabf849b37f" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("dataset", sa.Column("license_url", sa.String(), nullable=True)) + op.add_column("dataset", sa.Column("license_name", sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("dataset", "license_name") + op.drop_column("dataset", "license_url") + # ### end Alembic commands ### diff --git a/oc4ids_datastore_pipeline/database.py b/oc4ids_datastore_pipeline/database.py index 8c6f0dd..476bd00 100644 --- a/oc4ids_datastore_pipeline/database.py +++ b/oc4ids_datastore_pipeline/database.py @@ -1,6 +1,7 @@ import datetime import logging import os +from typing import Optional from sqlalchemy import ( DateTime, @@ -25,6 +26,8 @@ class Dataset(Base): dataset_id: Mapped[str] = mapped_column(String, primary_key=True) source_url: Mapped[str] = mapped_column(String) publisher_name: Mapped[str] = mapped_column(String) + license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) + license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True) json_url: Mapped[str] = mapped_column(String) updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index 12be6cf..d58c420 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -8,7 +8,10 @@ from libcoveoc4ids.api import oc4ids_json_output from oc4ids_datastore_pipeline.database import Dataset, save_dataset -from oc4ids_datastore_pipeline.registry import fetch_registered_datasets +from oc4ids_datastore_pipeline.registry import ( + fetch_registered_datasets, + get_license_name_from_url, +) logger = logging.getLogger(__name__) @@ -54,10 +57,14 @@ def save_dataset_metadata( ) -> None: logger.info(f"Saving metadata for dataset {dataset_name}") publisher_name = json_data.get("publisher", {}).get("name", "") + license_url = json_data.get("license", None) + license_name = get_license_name_from_url(license_url) if license_url else None dataset = Dataset( dataset_id=dataset_name, source_url=source_url, publisher_name=publisher_name, + license_url=license_url, + license_name=license_name, json_url=json_url, updated_at=datetime.datetime.now(datetime.UTC), ) diff --git a/oc4ids_datastore_pipeline/registry.py b/oc4ids_datastore_pipeline/registry.py index 783dc2c..72fa14d 100644 --- a/oc4ids_datastore_pipeline/registry.py +++ b/oc4ids_datastore_pipeline/registry.py @@ -1,10 +1,14 @@ import logging +from typing import Optional import requests logger = logging.getLogger(__name__) +_license_mappings = None + + def fetch_registered_datasets() -> dict[str, str]: logger.info("Fetching registered datasets list from registry") try: @@ -40,3 +44,12 @@ def fetch_license_mappings() -> dict[str, str]: "Failed to fetch license mappings from registry, with error: " + str(e), ) return {} + + +def get_license_name_from_url( + url: str, force_refresh: Optional[bool] = False +) -> Optional[str]: + global _license_mappings + if force_refresh or (_license_mappings is None): + _license_mappings = fetch_license_mappings() + return _license_mappings.get(url, None) diff --git a/tests/test_registry.py b/tests/test_registry.py index c94c985..6398524 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -6,6 +6,7 @@ from oc4ids_datastore_pipeline.registry import ( fetch_license_mappings, fetch_registered_datasets, + get_license_name_from_url, ) @@ -99,3 +100,34 @@ def test_fetch_license_mappings_catches_exception( result = fetch_license_mappings() assert result == {} + + +def test_get_license_name_from_url(mocker: MockerFixture) -> None: + patch_license_mappings = mocker.patch( + "oc4ids_datastore_pipeline.registry.fetch_license_mappings" + ) + patch_license_mappings.return_value = { + "https://license_1.com/license": "License 1", + "https://license_2.com/license": "License 2", + } + + license_name = get_license_name_from_url( + "https://license_2.com/license", force_refresh=True + ) + + assert license_name == "License 2" + + +def test_get_license_name_from_url_not_in_mapping(mocker: MockerFixture) -> None: + patch_license_mappings = mocker.patch( + "oc4ids_datastore_pipeline.registry.fetch_license_mappings" + ) + patch_license_mappings.return_value = { + "https://license_1.com/license": "License 1", + } + + license_name = get_license_name_from_url( + "https://license_2.com/license", force_refresh=True + ) + + assert license_name is None