"""rename column license name to license title

Revision ID: 8182d8c386f7
Revises: 3499656b84e7
Create Date: 2025-02-25 15:40:01.727396

"""

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "8182d8c386f7"
down_revision: Union[str, None] = "3499656b84e7"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _rename_dataset_column(current_name: str, target_name: str) -> None:
    """Rename a column of the ``dataset`` table, passing ``nullable=True``.

    Both directions of this migration issue the same ``alter_column`` call
    with the old and new names swapped, so the call lives in one place.
    """
    op.alter_column(
        "dataset",
        current_name,
        new_column_name=target_name,
        nullable=True,
    )


def upgrade() -> None:
    """Rename ``dataset.license_name`` to ``dataset.license_title``."""
    _rename_dataset_column("license_name", "license_title")


def downgrade() -> None:
    """Rename ``dataset.license_title`` back to ``dataset.license_name``."""
    _rename_dataset_column("license_title", "license_name")
revision: str = "ebb26242c904"
down_revision: Union[str, None] = "8182d8c386f7"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the nullable ``license_title_short`` column to ``dataset``.

    Also (re)asserts that ``license_title`` is nullable.
    """
    op.add_column(
        "dataset", sa.Column("license_title_short", sa.String(), nullable=True)
    )
    # The parent revision (8182d8c386f7) already renames the column with
    # nullable=True, so this only matters for a database whose column is
    # still NOT NULL; otherwise it changes nothing.
    op.alter_column(
        "dataset", "license_title", existing_type=sa.VARCHAR(), nullable=True
    )


def downgrade() -> None:
    """Drop the ``license_title_short`` column added by ``upgrade``.

    ``license_title`` is deliberately left nullable: the parent revision
    (8182d8c386f7) leaves it nullable, so forcing NOT NULL here would not
    restore the parent schema and would fail on any row whose license title
    is NULL.
    """
    op.drop_column("dataset", "license_title_short")
def get_license_title_from_url(
    url: str, force_refresh: Optional[bool] = False
) -> tuple[Optional[str], Optional[str]]:
    """Return the ``(title, title_short)`` pair recorded for a license URL.

    The mappings are kept in the module-level ``_license_mappings`` and are
    fetched via ``fetch_license_mappings()`` when that global is ``None`` or
    when ``force_refresh`` is truthy.

    Args:
        url: License URL to look up in the mappings.
        force_refresh: When truthy, re-fetch the mappings before the lookup.

    Returns:
        A 2-tuple of the ``"title"`` and ``"title_short"`` values stored for
        ``url``. Either element is ``None`` when its key is absent, and both
        are ``None`` when ``url`` is not in the mappings.
    """
    global _license_mappings

    must_fetch = bool(force_refresh) or _license_mappings is None
    if must_fetch:
        _license_mappings = fetch_license_mappings()

    # An unknown URL behaves like an entry with no fields at all.
    try:
        entry = _license_mappings[url]
    except KeyError:
        entry = {}

    title = entry.get("title")
    title_short = entry.get("title_short")
    return title, title_short
"title_short": "L2", + }, } @@ -116,32 +127,60 @@ def test_fetch_license_mappings_catches_exception( assert result == {} -def test_get_license_name_from_url(mocker: MockerFixture) -> None: +def test_get_license_title_from_url(mocker: MockerFixture) -> None: patch_license_mappings = mocker.patch( "oc4ids_datastore_pipeline.registry.fetch_license_mappings" ) patch_license_mappings.return_value = { - "https://license_1.com/license": "License 1", - "https://license_2.com/license": "License 2", + "https://license_1.com/license": { + "title": "License 1", + "title_short": "L1", + }, + "https://license_2.com/license": { + "title": "License 2", + "title_short": "L2", + }, } - license_name = get_license_name_from_url( + license_title = get_license_title_from_url( "https://license_2.com/license", force_refresh=True ) - assert license_name == "License 2" + assert license_title == ("License 2", "L2") -def test_get_license_name_from_url_not_in_mapping(mocker: MockerFixture) -> None: +def test_get_license_title_from_url_not_in_mapping(mocker: MockerFixture) -> None: patch_license_mappings = mocker.patch( "oc4ids_datastore_pipeline.registry.fetch_license_mappings" ) patch_license_mappings.return_value = { - "https://license_1.com/license": "License 1", + "https://license_1.com/license": { + "title": "License 1", + "title_short": "L1", + }, } - license_name = get_license_name_from_url( + license_title = get_license_title_from_url( "https://license_2.com/license", force_refresh=True ) - assert license_name is None + assert license_title == (None, None) + + +def test_get_license_name_from_url_short_name_not_in_mapping( + mocker: MockerFixture, +) -> None: + patch_license_mappings = mocker.patch( + "oc4ids_datastore_pipeline.registry.fetch_license_mappings" + ) + patch_license_mappings.return_value = { + "https://license_2.com/license": { + "title": "License 2", + }, + } + + license_title = get_license_title_from_url( + "https://license_2.com/license", force_refresh=True + ) + + 
assert license_title == ("License 2", None)