From 991b1150c3dfa0c61ca9c627946c277478e71965 Mon Sep 17 00:00:00 2001 From: James B Date: Tue, 8 Jul 2025 09:00:07 +0100 Subject: [PATCH 1/4] refactor: oc4ids_datastore_pipeline/registry.py: Change to getting individual data records This means we can get all the dataset fields datatig has, previously we could only get fields in list_fields in the datatig config --- oc4ids_datastore_pipeline/registry.py | 14 ++++++++------ tests/test_registry.py | 25 +++++++++++++++++-------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/oc4ids_datastore_pipeline/registry.py b/oc4ids_datastore_pipeline/registry.py index 0890c5d..47b1a69 100644 --- a/oc4ids_datastore_pipeline/registry.py +++ b/oc4ids_datastore_pipeline/registry.py @@ -16,13 +16,15 @@ def fetch_registered_datasets() -> dict[str, dict[str, str]]: r = requests.get(url) r.raise_for_status() json_data = r.json() - registered_datasets = { - key: { - "source_url": value["fields"]["url"]["value"], - "country": value["fields"]["country"]["value"], + registered_datasets = {} + for key, value in json_data["records"].items(): + r_data = requests.get(value["api_url"]) + r_data.raise_for_status() + r_data_json = r_data.json() + registered_datasets[key] = { + "source_url": r_data_json["fields"]["url"]["value"], + "country": r_data_json["fields"]["country"]["value"], } - for (key, value) in json_data["records"].items() - } registered_datasets_count = len(registered_datasets) logger.info(f"Fetched URLs for {registered_datasets_count} datasets") except Exception as e: diff --git a/tests/test_registry.py b/tests/test_registry.py index d0079ce..cf21b63 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -12,16 +12,25 @@ def test_fetch_registered_datasets(mocker: MockerFixture) -> None: mock_response = MagicMock() - mock_response.json.return_value = { - "records": { - "test_dataset": { - "fields": { - "url": {"value": "https://test_dataset.json"}, - "country": {"value": "ab"}, + mock_response.json.side_effect = [ + { + "records": { + "test_dataset": { + "api_url": "http://www.example.com", + "fields": { + "url": {"value": "https://test_dataset.json"}, + "country": {"value": "ab"}, + }, } } - } - } + }, + { + "fields": { + "url": {"value": "https://test_dataset.json"}, + "country": {"value": "ab"}, + } + }, + ] patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get") patch_get.return_value = mock_response From 1a2372eb6dfcca7bbd1f4f211806fc0a705f695c Mon Sep 17 00:00:00 2001 From: James B Date: Tue, 8 Jul 2025 09:26:50 +0100 Subject: [PATCH 2/4] build: Install dev python dependencies in dev Docker container --- Dockerfile.dev | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.dev b/Dockerfile.dev index 8247e61..1fc6517 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -5,9 +5,9 @@ RUN apt-get update \ WORKDIR /oc4ids_datastore_pipeline -COPY requirements.txt . +COPY requirements_dev.txt . -RUN pip install -r requirements.txt +RUN pip install -r requirements_dev.txt COPY . . From c1010b269a4331d70530d13f457d80c842c369df Mon Sep 17 00:00:00 2001 From: James B Date: Tue, 8 Jul 2025 09:27:22 +0100 Subject: [PATCH 3/4] feat: Store portal title and URL https://github.com/OpenDataServices/oc4ids-registry/pull/20 --- .../versions/cde761a59c2f_add_portals.py | 32 +++++++++++++++++++ oc4ids_datastore_pipeline/database.py | 2 ++ oc4ids_datastore_pipeline/pipeline.py | 6 ++++ oc4ids_datastore_pipeline/registry.py | 2 ++ tests/test_registry.py | 9 +++++- 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 migrations/versions/cde761a59c2f_add_portals.py diff --git a/migrations/versions/cde761a59c2f_add_portals.py b/migrations/versions/cde761a59c2f_add_portals.py new file mode 100644 index 0000000..92d6301 --- /dev/null +++ b/migrations/versions/cde761a59c2f_add_portals.py @@ -0,0 +1,32 @@ +"""add_portals + +Revision ID: cde761a59c2f +Revises: b21b5de6ee2d +Create Date: 2025-07-08 07:51:48.954914 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'cde761a59c2f' +down_revision: Union[str, None] = 'b21b5de6ee2d' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('dataset', sa.Column('portal_url', sa.String(), nullable=True)) + op.add_column('dataset', sa.Column('portal_title', sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('dataset', 'portal_title') + op.drop_column('dataset', 'portal_url') + # ### end Alembic commands ### diff --git a/oc4ids_datastore_pipeline/database.py b/oc4ids_datastore_pipeline/database.py index 3a1543f..757d60f 100644 --- a/oc4ids_datastore_pipeline/database.py +++ b/oc4ids_datastore_pipeline/database.py @@ -36,6 +36,8 @@ class Dataset(Base): csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) + portal_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) + portal_title: Mapped[Optional[str]] = mapped_column(String, nullable=True) def get_engine() -> Engine: diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index abcc6ff..f38eda7 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -111,6 +111,8 @@ def save_dataset_metadata( json_url: Optional[str], csv_url: Optional[str], xlsx_url: Optional[str], + portal_title: Optional[str], + portal_url: Optional[str], ) -> None: logger.info(f"Saving metadata for dataset {dataset_id}") try: @@ -127,6 +129,8 @@ def save_dataset_metadata( license_url=license_url, license_title=license_title, license_title_short=license_title_short, + portal_title=portal_title, + portal_url=portal_url, json_url=json_url, csv_url=csv_url, xlsx_url=xlsx_url, @@ -157,6 +161,8 @@ def process_dataset(dataset_id: str, registry_metadata: dict[str, str]) -> None: json_url=json_public_url, csv_url=csv_public_url, xlsx_url=xlsx_public_url, + portal_title=registry_metadata["portal_title"], + portal_url=registry_metadata["portal_url"], ) logger.info(f"Processed dataset {dataset_id}") diff --git a/oc4ids_datastore_pipeline/registry.py b/oc4ids_datastore_pipeline/registry.py index 47b1a69..90059da 100644 --- a/oc4ids_datastore_pipeline/registry.py +++ b/oc4ids_datastore_pipeline/registry.py @@ -24,6 +24,8 @@ def fetch_registered_datasets() -> dict[str, dict[str, str]]: registered_datasets[key] = { "source_url": r_data_json["fields"]["url"]["value"], "country": r_data_json["fields"]["country"]["value"], + "portal_title": r_data_json["fields"]["portal_title"]["value"], + "portal_url": r_data_json["fields"]["portal_url"]["value"], } registered_datasets_count = len(registered_datasets) logger.info(f"Fetched URLs for {registered_datasets_count} datasets") diff --git a/tests/test_registry.py b/tests/test_registry.py index cf21b63..b4e2840 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -28,6 +28,8 @@ def test_fetch_registered_datasets(mocker: MockerFixture) -> None: "fields": { "url": {"value": "https://test_dataset.json"}, "country": {"value": "ab"}, + "portal_title": {"value": "Our Portal"}, + "portal_url": {"value": "https://our.portal"}, } }, ] @@ -37,7 +39,12 @@ def test_fetch_registered_datasets(mocker: MockerFixture) -> None: result = fetch_registered_datasets() assert result == { - "test_dataset": {"source_url": "https://test_dataset.json", "country": "ab"} + "test_dataset": { + "source_url": "https://test_dataset.json", + "country": "ab", + "portal_title": "Our Portal", + "portal_url": "https://our.portal", + } } From d891609745e9219ad939ae822deaee8c39238085 Mon Sep 17 00:00:00 2001 From: James B Date: Thu, 10 Jul 2025 08:50:44 +0100 Subject: [PATCH 4/4] feat: bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 864812e..7b2cd9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "flit_core.buildapi" [project] name = "oc4ids-datastore-pipeline" description = "OC4IDS Datastore Pipeline" -version = "0.5.0" +version = "0.6.0" readme = "README.md" dependencies = [ "alembic",