4 changes: 2 additions & 2 deletions Dockerfile.dev
@@ -5,9 +5,9 @@ RUN apt-get update \

WORKDIR /oc4ids_datastore_pipeline

-COPY requirements.txt .
+COPY requirements_dev.txt .

-RUN pip install -r requirements.txt
+RUN pip install -r requirements_dev.txt

COPY . .

32 changes: 32 additions & 0 deletions migrations/versions/cde761a59c2f_add_portals.py
@@ -0,0 +1,32 @@
"""add_portals
Revision ID: cde761a59c2f
Revises: b21b5de6ee2d
Create Date: 2025-07-08 07:51:48.954914
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'cde761a59c2f'
down_revision: Union[str, None] = 'b21b5de6ee2d'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column('dataset', sa.Column('portal_url', sa.String(), nullable=True))
    op.add_column('dataset', sa.Column('portal_title', sa.String(), nullable=True))
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column('dataset', 'portal_title')
    op.drop_column('dataset', 'portal_url')
    # ### end Alembic commands ###
2 changes: 2 additions & 0 deletions oc4ids_datastore_pipeline/database.py
@@ -36,6 +36,8 @@ class Dataset(Base):
    csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
+    portal_url: Mapped[Optional[str]] = mapped_column(String, nullable=True)
+    portal_title: Mapped[Optional[str]] = mapped_column(String, nullable=True)


def get_engine() -> Engine:
6 changes: 6 additions & 0 deletions oc4ids_datastore_pipeline/pipeline.py
@@ -111,6 +111,8 @@ def save_dataset_metadata(
    json_url: Optional[str],
    csv_url: Optional[str],
    xlsx_url: Optional[str],
+    portal_title: Optional[str],
+    portal_url: Optional[str],
) -> None:
logger.info(f"Saving metadata for dataset {dataset_id}")
try:
Expand All @@ -127,6 +129,8 @@ def save_dataset_metadata(
            license_url=license_url,
            license_title=license_title,
            license_title_short=license_title_short,
+            portal_title=portal_title,
+            portal_url=portal_url,
            json_url=json_url,
            csv_url=csv_url,
            xlsx_url=xlsx_url,
@@ -157,6 +161,8 @@ def process_dataset(dataset_id: str, registry_metadata: dict[str, str]) -> None:
        json_url=json_public_url,
        csv_url=csv_public_url,
        xlsx_url=xlsx_public_url,
+        portal_title=registry_metadata["portal_title"],
+        portal_url=registry_metadata["portal_url"],
    )
    logger.info(f"Processed dataset {dataset_id}")

16 changes: 10 additions & 6 deletions oc4ids_datastore_pipeline/registry.py
@@ -16,13 +16,17 @@ def fetch_registered_datasets() -> dict[str, dict[str, str]]:
        r = requests.get(url)
        r.raise_for_status()
        json_data = r.json()
-        registered_datasets = {
-            key: {
-                "source_url": value["fields"]["url"]["value"],
-                "country": value["fields"]["country"]["value"],
+        registered_datasets = {}
+        for key, value in json_data["records"].items():
+            r_data = requests.get(value["api_url"])
+            r_data.raise_for_status()
+            r_data_json = r_data.json()
+            registered_datasets[key] = {
+                "source_url": r_data_json["fields"]["url"]["value"],
+                "country": r_data_json["fields"]["country"]["value"],
+                "portal_title": r_data_json["fields"]["portal_title"]["value"],
+                "portal_url": r_data_json["fields"]["portal_url"]["value"],
            }
-            for (key, value) in json_data["records"].items()
-        }
        registered_datasets_count = len(registered_datasets)
        logger.info(f"Fetched URLs for {registered_datasets_count} datasets")
    except Exception as e:

Review thread on `r_data = requests.get(value["api_url"])`:

Member: I'm not sure how big `json_data["records"]` is, but couldn't that be a very large number of requests all at once? Has some kind of control over that been considered?

Contributor (author): It's only ~10 records, it isn't expected to grow fast, and we have control over the API. If it gets bigger, the thing to do is look into ways to get all the data in one API call. The API already allows that (there's a SQLite file with all the data in it), but it would mean more changes, and this will do for now.
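If the registry does grow before a single-call fetch lands, the loop could be throttled cheaply. A minimal sketch of one option, a fixed delay between detail requests; the helper name and delay value are illustrative, not part of this PR:

```python
import time

import requests


def fetch_record_fields(
    records: dict[str, dict], delay_seconds: float = 0.5
) -> dict[str, dict]:
    """Fetch each record's detail JSON, pausing between requests (hypothetical helper)."""
    fields = {}
    for key, record in records.items():
        response = requests.get(record["api_url"])
        response.raise_for_status()
        fields[key] = response.json()["fields"]
        time.sleep(delay_seconds)  # crude rate limiting between consecutive requests
    return fields
```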
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "flit_core.buildapi"
[project]
name = "oc4ids-datastore-pipeline"
description = "OC4IDS Datastore Pipeline"
version = "0.5.0"
version = "0.6.0"
readme = "README.md"
dependencies = [
"alembic",
34 changes: 25 additions & 9 deletions tests/test_registry.py
@@ -12,23 +12,39 @@

def test_fetch_registered_datasets(mocker: MockerFixture) -> None:
    mock_response = MagicMock()
-    mock_response.json.return_value = {
-        "records": {
-            "test_dataset": {
-                "fields": {
-                    "url": {"value": "https://test_dataset.json"},
-                    "country": {"value": "ab"},
+    mock_response.json.side_effect = [
+        {
+            "records": {
+                "test_dataset": {
+                    "api_url": "http://www.example.com",
+                    "fields": {
+                        "url": {"value": "https://test_dataset.json"},
+                        "country": {"value": "ab"},
+                    },
                 }
             }
-        }
-    }
+        },
+        {
+            "fields": {
+                "url": {"value": "https://test_dataset.json"},
+                "country": {"value": "ab"},
+                "portal_title": {"value": "Our Portal"},
+                "portal_url": {"value": "https://our.portal"},
+            }
+        },
+    ]
    patch_get = mocker.patch("oc4ids_datastore_pipeline.pipeline.requests.get")
    patch_get.return_value = mock_response

    result = fetch_registered_datasets()

    assert result == {
-        "test_dataset": {"source_url": "https://test_dataset.json", "country": "ab"}
+        "test_dataset": {
+            "source_url": "https://test_dataset.json",
+            "country": "ab",
+            "portal_title": "Our Portal",
+            "portal_url": "https://our.portal",
+        }
    }
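Note on the mock: setting `side_effect` to a list makes successive calls return successive elements, so the first payload stands in for the registry listing and the second for the per-record detail fetch (both `requests.get` calls return the same `mock_response`). A standalone illustration, not project code:

```python
from unittest.mock import MagicMock

mock = MagicMock()
mock.json.side_effect = [{"call": "first"}, {"call": "second"}]
assert mock.json() == {"call": "first"}   # first call: registry listing
assert mock.json() == {"call": "second"}  # second call: record detail
```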

