Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""rename column license name to license title
Revision ID: 8182d8c386f7
Revises: 3499656b84e7
Create Date: 2025-02-25 15:40:01.727396
"""

from typing import Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = "8182d8c386f7"
down_revision: Union[str, None] = "3499656b84e7"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Rename ``dataset.license_name`` to ``dataset.license_title``.

    The same operation also declares the column as nullable.
    """
    op.alter_column(
        table_name="dataset",
        column_name="license_name",
        new_column_name="license_title",
        nullable=True,
    )


def downgrade() -> None:
    """Rename ``dataset.license_title`` back to ``dataset.license_name``.

    Mirrors ``upgrade``: the column is declared nullable as part of the rename.
    """
    op.alter_column(
        table_name="dataset",
        column_name="license_title",
        new_column_name="license_name",
        nullable=True,
    )
39 changes: 39 additions & 0 deletions migrations/versions/ebb26242c904_add_license_title_short_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""add license_title_short column

Revision ID: ebb26242c904
Revises: 8182d8c386f7
Create Date: 2025-02-25 15:47:03.169950

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "ebb26242c904"
down_revision: Union[str, None] = "8182d8c386f7"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the nullable ``license_title_short`` column to ``dataset``.

    Also declares the existing ``license_title`` column as nullable.
    """
    short_title_column = sa.Column("license_title_short", sa.String(), nullable=True)
    op.add_column("dataset", short_title_column)

    op.alter_column(
        table_name="dataset",
        column_name="license_title",
        existing_type=sa.VARCHAR(),
        nullable=True,
    )


def downgrade() -> None:
    """Revert this revision on the ``dataset`` table.

    Marks ``license_title`` as NOT NULL, then drops ``license_title_short``.
    """
    # NOTE(review): setting nullable=False presumably fails if any row already
    # holds a NULL license_title -- confirm against real data before downgrading.
    op.alter_column(
        table_name="dataset",
        column_name="license_title",
        existing_type=sa.VARCHAR(),
        nullable=False,
    )

    op.drop_column("dataset", "license_title_short")
3 changes: 2 additions & 1 deletion oc4ids_datastore_pipeline/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ class Dataset(Base):
source_url: Mapped[str] = mapped_column(String)
publisher_name: Mapped[str] = mapped_column(String)
license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True)
license_title: Mapped[Optional[str]] = mapped_column(String, nullable=True)
license_title_short: Mapped[Optional[str]] = mapped_column(String, nullable=True)
json_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
Expand Down
9 changes: 6 additions & 3 deletions oc4ids_datastore_pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from oc4ids_datastore_pipeline.notifications import send_notification
from oc4ids_datastore_pipeline.registry import (
fetch_registered_datasets,
get_license_name_from_url,
get_license_title_from_url,
)
from oc4ids_datastore_pipeline.storage import delete_files_for_dataset, upload_files

Expand Down Expand Up @@ -108,13 +108,16 @@ def save_dataset_metadata(
try:
publisher_name = json_data.get("publisher", {}).get("name", "")
license_url = json_data.get("license", None)
license_name = get_license_name_from_url(license_url) if license_url else None
license_title, license_title_short = (
get_license_title_from_url(license_url) if license_url else (None, None)
)
dataset = Dataset(
dataset_id=dataset_id,
source_url=source_url,
publisher_name=publisher_name,
license_url=license_url,
license_name=license_name,
license_title=license_title,
license_title_short=license_title_short,
json_url=json_url,
csv_url=csv_url,
xlsx_url=xlsx_url,
Expand Down
14 changes: 9 additions & 5 deletions oc4ids_datastore_pipeline/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,18 @@ def fetch_registered_datasets() -> dict[str, str]:
return registered_datasets


def fetch_license_mappings() -> dict[str, str]:
def fetch_license_mappings() -> dict[str, dict[str, Optional[str]]]:
logger.info("Fetching license mappings from registry")
try:
url = "https://opendataservices.github.io/oc4ids-registry/datatig/type/license/records_api.json" # noqa: E501
r = requests.get(url)
r.raise_for_status()
json_data = r.json()
return {
urls["fields"]["url"]["value"]: license["fields"]["title"]["value"]
urls["fields"]["url"]["value"]: {
"title": license["fields"]["title"]["value"],
"title_short": license["fields"]["title_short"]["value"],
}
for license in json_data["records"].values()
for urls in license["fields"]["urls"]["values"]
}
Expand All @@ -50,10 +53,11 @@ def fetch_license_mappings() -> dict[str, str]:
return {}


def get_license_title_from_url(
    url: str, force_refresh: Optional[bool] = False
) -> tuple[Optional[str], Optional[str]]:
    """Return the ``(title, title_short)`` pair registered for a license URL.

    The mappings are cached in the module-level ``_license_mappings`` and are
    fetched through ``fetch_license_mappings`` on first use, or again whenever
    ``force_refresh`` is truthy.

    Args:
        url: License URL to look up.
        force_refresh: When truthy, re-fetch the mappings even if cached.

    Returns:
        A ``(title, title_short)`` tuple. Each element is ``None`` when the URL
        is not in the mappings or the entry lacks that key.
    """
    global _license_mappings
    if force_refresh or (_license_mappings is None):
        _license_mappings = fetch_license_mappings()
    # An unknown URL falls back to an empty dict so both lookups yield None.
    license_titles = _license_mappings.get(url, {})
    return license_titles.get("title"), license_titles.get("title_short")
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "flit_core.buildapi"
[project]
name = "oc4ids-datastore-pipeline"
description = "OC4IDS Datastore Pipeline"
version = "0.2.0"
version = "0.3.0"
readme = "README.md"
dependencies = [
"alembic",
Expand Down
65 changes: 52 additions & 13 deletions tests/test_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from oc4ids_datastore_pipeline.registry import (
fetch_license_mappings,
fetch_registered_datasets,
get_license_name_from_url,
get_license_title_from_url,
)


Expand Down Expand Up @@ -59,6 +59,7 @@ def test_fetch_license_mappings(mocker: MockerFixture) -> None:
"license_1": {
"fields": {
"title": {"value": "License 1"},
"title_short": {"value": "L1"},
"urls": {
"values": [
{
Expand All @@ -80,6 +81,7 @@ def test_fetch_license_mappings(mocker: MockerFixture) -> None:
"license_2": {
"fields": {
"title": {"value": "License 2"},
"title_short": {"value": "L2"},
"urls": {
"values": [
{
Expand All @@ -99,9 +101,18 @@ def test_fetch_license_mappings(mocker: MockerFixture) -> None:
result = fetch_license_mappings()

assert result == {
"https://license_1.com/license": "License 1",
"https://license_1.com/different_url": "License 1",
"https://license_2.com/license": "License 2",
"https://license_1.com/license": {
"title": "License 1",
"title_short": "L1",
},
"https://license_1.com/different_url": {
"title": "License 1",
"title_short": "L1",
},
"https://license_2.com/license": {
"title": "License 2",
"title_short": "L2",
},
}


Expand All @@ -116,32 +127,60 @@ def test_fetch_license_mappings_catches_exception(
assert result == {}


def test_get_license_title_from_url(mocker: MockerFixture) -> None:
    """A URL present in the mappings yields its ``(title, title_short)`` pair."""
    patch_license_mappings = mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings"
    )
    patch_license_mappings.return_value = {
        "https://license_1.com/license": {
            "title": "License 1",
            "title_short": "L1",
        },
        "https://license_2.com/license": {
            "title": "License 2",
            "title_short": "L2",
        },
    }

    # force_refresh=True makes the lookup use the patched fetch, not a cached value.
    license_title = get_license_title_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert license_title == ("License 2", "L2")


def test_get_license_title_from_url_not_in_mapping(mocker: MockerFixture) -> None:
    """A URL absent from the mappings yields ``(None, None)``."""
    patch_license_mappings = mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings"
    )
    patch_license_mappings.return_value = {
        "https://license_1.com/license": {
            "title": "License 1",
            "title_short": "L1",
        },
    }

    # force_refresh=True makes the lookup use the patched fetch, not a cached value.
    license_title = get_license_title_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert license_title == (None, None)


def test_get_license_name_from_url_short_name_not_in_mapping(
    mocker: MockerFixture,
) -> None:
    """An entry lacking ``title_short`` yields ``None`` as the short title."""
    license_url = "https://license_2.com/license"
    mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings",
        return_value={license_url: {"title": "License 2"}},
    )

    # force_refresh=True makes the lookup use the patched fetch, not a cached value.
    titles = get_license_title_from_url(license_url, force_refresh=True)

    assert titles == ("License 2", None)