Skip to content

Commit 192a9e7

Browse files
feat: save license metadata to database
1 parent 9f65a39 commit 192a9e7

File tree

5 files changed

+89
-1
lines changed

5 files changed

+89
-1
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""add license columns to dataset table
2+
3+
Revision ID: 85905d23accc
4+
Revises: aaabf849b37f
5+
Create Date: 2025-02-05 09:45:04.056529
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
from alembic import op
12+
import sqlalchemy as sa
13+
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = "85905d23accc"
17+
down_revision: Union[str, None] = "aaabf849b37f"
18+
branch_labels: Union[str, Sequence[str], None] = None
19+
depends_on: Union[str, Sequence[str], None] = None
20+
21+
22+
def upgrade() -> None:
    """Add the nullable string license columns to the ``dataset`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Order matters only for mirroring downgrade(): url first, then name.
    for column_name in ("license_url", "license_name"):
        op.add_column("dataset", sa.Column(column_name, sa.String(), nullable=True))
    # ### end Alembic commands ###
27+
28+
29+
def downgrade() -> None:
    """Drop the license columns from the ``dataset`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Dropped in the reverse of the order upgrade() adds them.
    for column_name in ("license_name", "license_url"):
        op.drop_column("dataset", column_name)
    # ### end Alembic commands ###

oc4ids_datastore_pipeline/database.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import datetime
22
import logging
33
import os
4+
from typing import Optional
45

56
from sqlalchemy import (
67
DateTime,
@@ -25,6 +26,8 @@ class Dataset(Base):
2526
dataset_id: Mapped[str] = mapped_column(String, primary_key=True)
2627
source_url: Mapped[str] = mapped_column(String)
2728
publisher_name: Mapped[str] = mapped_column(String)
29+
license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
30+
license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True)
2831
json_url: Mapped[str] = mapped_column(String)
2932
updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
3033

oc4ids_datastore_pipeline/pipeline.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
from libcoveoc4ids.api import oc4ids_json_output
99

1010
from oc4ids_datastore_pipeline.database import Dataset, save_dataset
11-
from oc4ids_datastore_pipeline.registry import fetch_registered_datasets
11+
from oc4ids_datastore_pipeline.registry import (
12+
fetch_registered_datasets,
13+
get_license_name_from_url,
14+
)
1215

1316
logger = logging.getLogger(__name__)
1417

@@ -54,10 +57,14 @@ def save_dataset_metadata(
5457
) -> None:
5558
logger.info(f"Saving metadata for dataset {dataset_name}")
5659
publisher_name = json_data.get("publisher", {}).get("name", "")
60+
license_url = json_data.get("license", None)
61+
license_name = get_license_name_from_url(license_url) if license_url else None
5762
dataset = Dataset(
5863
dataset_id=dataset_name,
5964
source_url=source_url,
6065
publisher_name=publisher_name,
66+
license_url=license_url,
67+
license_name=license_name,
6168
json_url=json_url,
6269
updated_at=datetime.datetime.now(datetime.UTC),
6370
)

oc4ids_datastore_pipeline/registry.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
import logging
2+
from typing import Optional
23

34
import requests
45

56
logger = logging.getLogger(__name__)
67

78

9+
_license_mappings = None
10+
11+
812
def fetch_registered_datasets() -> dict[str, str]:
913
logger.info("Fetching registered datasets list from registry")
1014
try:
@@ -40,3 +44,12 @@ def fetch_license_mappings() -> dict[str, str]:
4044
"Failed to fetch license mappings from registry, with error: " + str(e),
4145
)
4246
return {}
47+
48+
49+
def get_license_name_from_url(
    url: str, force_refresh: bool = False
) -> Optional[str]:
    """Return the license name registered for ``url``, or ``None`` if unknown.

    The URL-to-name mappings are fetched via ``fetch_license_mappings()`` and
    cached in the module-level ``_license_mappings`` so that repeated lookups
    do not hit the registry each time.

    Args:
        url: License URL to look up.
        force_refresh: When true, re-fetch the mappings even if a cached copy
            exists.

    Returns:
        The license name mapped to ``url``, or ``None`` when the URL is not
        present in the mappings.
    """
    global _license_mappings
    # `fetch_license_mappings()` returns an empty dict when the request fails.
    # Treat an empty cache like a missing one so a transient failure is retried
    # on the next lookup instead of being cached for the life of the process.
    if force_refresh or not _license_mappings:
        _license_mappings = fetch_license_mappings()
    return _license_mappings.get(url)

tests/test_registry.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from oc4ids_datastore_pipeline.registry import (
77
fetch_license_mappings,
88
fetch_registered_datasets,
9+
get_license_name_from_url,
910
)
1011

1112

@@ -99,3 +100,34 @@ def test_fetch_license_mappings_catches_exception(
99100
result = fetch_license_mappings()
100101

101102
assert result == {}
103+
104+
105+
def test_get_license_name_from_url(mocker: MockerFixture) -> None:
    """A URL present in the fetched mappings resolves to its license name."""
    mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings",
        return_value={
            "https://license_1.com/license": "License 1",
            "https://license_2.com/license": "License 2",
        },
    )

    # force_refresh=True makes the lookup call the patched fetch rather than
    # reuse mappings cached by an earlier call.
    result = get_license_name_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert result == "License 2"
119+
120+
121+
def test_get_license_name_from_url_not_in_mapping(mocker: MockerFixture) -> None:
    """A URL absent from the fetched mappings resolves to ``None``."""
    mocker.patch(
        "oc4ids_datastore_pipeline.registry.fetch_license_mappings",
        return_value={"https://license_1.com/license": "License 1"},
    )

    # force_refresh=True makes the lookup call the patched fetch rather than
    # reuse mappings cached by an earlier call.
    result = get_license_name_from_url(
        "https://license_2.com/license", force_refresh=True
    )

    assert result is None

0 commit comments

Comments
 (0)