diff --git a/Dockerfile.dev b/Dockerfile.dev
index 8247e61..1fc6517 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -5,9 +5,9 @@ RUN apt-get update \
 
 WORKDIR /oc4ids_datastore_pipeline
 
-COPY requirements.txt .
+COPY requirements_dev.txt .
 
-RUN pip install -r requirements.txt
+RUN pip install -r requirements_dev.txt
 
 COPY . .
 
diff --git a/migrations/versions/cde761a59c2f_add_portals.py b/migrations/versions/cde761a59c2f_add_portals.py
new file mode 100644
index 0000000..92d6301
--- /dev/null
+++ b/migrations/versions/cde761a59c2f_add_portals.py
@@ -0,0 +1,32 @@
+"""add_portals
+
+Revision ID: cde761a59c2f
+Revises: b21b5de6ee2d
+Create Date: 2025-07-08 07:51:48.954914
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'cde761a59c2f'
+down_revision: Union[str, None] = 'b21b5de6ee2d'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('dataset', sa.Column('portal_url', sa.String(), nullable=True))
+    op.add_column('dataset', sa.Column('portal_title', sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('dataset', 'portal_title')
+    op.drop_column('dataset', 'portal_url')
+    # ### end Alembic commands ###
diff --git a/oc4ids_datastore_pipeline/database.py b/oc4ids_datastore_pipeline/database.py
index 3a1543f..757d60f 100644
--- a/oc4ids_datastore_pipeline/database.py
+++ b/oc4ids_datastore_pipeline/database.py
@@ -36,6 +36,8 @@ class Dataset(Base):
     csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
     xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
     updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
+    portal_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+    portal_title: Mapped[Optional[str]] = mapped_column(String, nullable=True)
 
 
 def get_engine() -> Engine:
diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py
index abcc6ff..f38eda7 100644
--- a/oc4ids_datastore_pipeline/pipeline.py
+++ b/oc4ids_datastore_pipeline/pipeline.py
@@ -111,6 +111,8 @@ def save_dataset_metadata(
     json_url: Optional[str],
     csv_url: Optional[str],
     xlsx_url: Optional[str],
+    portal_title: Optional[str],
+    portal_url: Optional[str],
 ) -> None:
     logger.info(f"Saving metadata for dataset {dataset_id}")
     try:
@@ -127,6 +129,8 @@ def save_dataset_metadata(
             license_url=license_url,
             license_title=license_title,
             license_title_short=license_title_short,
+            portal_title=portal_title,
+            portal_url=portal_url,
             json_url=json_url,
             csv_url=csv_url,
             xlsx_url=xlsx_url,
@@ -157,6 +161,8 @@ def process_dataset(dataset_id: str, registry_metadata: dict[str, str]) -> None:
         json_url=json_public_url,
         csv_url=csv_public_url,
         xlsx_url=xlsx_public_url,
+        portal_title=registry_metadata["portal_title"],
+        portal_url=registry_metadata["portal_url"],
     )
 
     logger.info(f"Processed dataset {dataset_id}")
diff --git a/oc4ids_datastore_pipeline/registry.py b/oc4ids_datastore_pipeline/registry.py
index 0890c5d..90059da 100644
--- a/oc4ids_datastore_pipeline/registry.py
+++ b/oc4ids_datastore_pipeline/registry.py
@@ -16,13 +16,17 @@ def fetch_registered_datasets() -> dict[str, dict[str, str]]:
         r = requests.get(url)
         r.raise_for_status()
         json_data = r.json()
-        registered_datasets = {
-            key: {
-                "source_url": value["fields"]["url"]["value"],
-                "country": value["fields"]["country"]["value"],
+        registered_datasets = {}
+        for key, value in json_data["records"].items():
+            r_data = requests.get(value["api_url"])
+            r_data.raise_for_status()
+            r_data_json = r_data.json()
+            registered_datasets[key] = {
+                "source_url": r_data_json["fields"]["url"]["value"],
+                "country": r_data_json["fields"]["country"]["value"],
+                "portal_title": r_data_json["fields"]["portal_title"]["value"],
+                "portal_url": r_data_json["fields"]["portal_url"]["value"],
             }
-            for (key, value) in json_data["records"].items()
-        }
         registered_datasets_count = len(registered_datasets)
         logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
     except Exception as e:
diff --git a/pyproject.toml b/pyproject.toml
index 864812e..7b2cd9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "flit_core.buildapi"
 [project]
 name = "oc4ids-datastore-pipeline"
 description = "OC4IDS Datastore Pipeline"
-version = "0.5.0"
+version = "0.6.0"
 readme = "README.md"
 dependencies = [
     "alembic",
diff --git a/tests/test_registry.py b/tests/test_registry.py
index d0079ce..b4e2840 100644
--- a/tests/test_registry.py
+++ b/tests/test_registry.py
@@ -12,23 +12,39 @@ def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
     mock_response = MagicMock()
-    mock_response.json.return_value = {
-        "records": {
-            "test_dataset": {
-                "fields": {
-                    "url": {"value": "https://test_dataset.json"},
-                    "country": {"value": "ab"},
+    mock_response.json.side_effect = [
+        {
+            "records": {
+                "test_dataset": {
+                    "api_url": "http://www.example.com",
+                    "fields": {
+                        "url": {"value": "https://test_dataset.json"},
+                        "country": {"value": "ab"},
+                    },
                 }
             }
-        }
-    }
+        },
+        {
+            "fields": {
+                "url": {"value": "https://test_dataset.json"},
+                "country": {"value": "ab"},
+                "portal_title": {"value": "Our Portal"},
+                "portal_url": {"value": "https://our.portal"},
+            }
+        },
+    ]
     patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
     patch_get.return_value = mock_response
 
     result = fetch_registered_datasets()
 
     assert result == {
-        "test_dataset": {"source_url": "https://test_dataset.json", "country": "ab"}
+        "test_dataset": {
+            "source_url": "https://test_dataset.json",
+            "country": "ab",
+            "portal_title": "Our Portal",
+            "portal_url": "https://our.portal",
+        }
     }
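
Note on the migration: revision cde761a59c2f chains off b21b5de6ee2d and mirrors the two nullable columns added to the Dataset model in database.py, so model and schema stay in sync. Assuming the repository's existing Alembic setup, `alembic upgrade head` applies it and `alembic downgrade -1` reverts it, dropping portal_title and portal_url again.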
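
Note on the registry.py change: fetch_registered_datasets now issues one follow-up HTTP request per registered dataset (an N+1 pattern against the registry API). A minimal sketch of the same access pattern, using a shared requests.Session for connection reuse rather than the module-level requests.get in the patch; the "records"/"api_url"/"fields" response shapes are assumed from the mocks in tests/test_registry.py:

    import requests


    def fetch_registered_datasets_sketch(url: str) -> dict[str, dict[str, str]]:
        # Sketch only: mirrors the patched fetch_registered_datasets(), but a
        # shared Session lets the per-record requests reuse one HTTP connection.
        session = requests.Session()
        r = session.get(url)
        r.raise_for_status()
        registered_datasets = {}
        for key, value in r.json()["records"].items():
            # One follow-up request per record to that record's own endpoint.
            r_data = session.get(value["api_url"])
            r_data.raise_for_status()
            fields = r_data.json()["fields"]
            registered_datasets[key] = {
                "source_url": fields["url"]["value"],
                "country": fields["country"]["value"],
                "portal_title": fields["portal_title"]["value"],
                "portal_url": fields["portal_url"]["value"],
            }
        return registered_datasets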