From f4c136a4fdc93d3575b893495fdbe147fa879554 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:13:56 +0200 Subject: [PATCH 01/11] fix: move python-dotenv to main dependencies --- pyproject.toml | 2 +- requirements.txt | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml index d9661cf..1413f3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "flattentool", "libcoveoc4ids", "psycopg2", + "python-dotenv", "requests", "sqlalchemy", ] @@ -26,7 +27,6 @@ dev = [ "mypy", "pytest", "pytest-mock", - "python-dotenv", "types-boto3", "types-requests", ] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ed44670 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,181 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=requirements.txt pyproject.toml +# +alembic==1.14.1 + # via oc4ids-datastore-pipeline (pyproject.toml) +attrs==25.1.0 + # via + # cattrs + # jsonschema + # referencing + # requests-cache +backports-datetime-fromisoformat==2.0.3 + # via flattentool +boto3==1.36.17 + # via oc4ids-datastore-pipeline (pyproject.toml) +botocore==1.36.17 + # via + # boto3 + # s3transfer +btrees==6.1 + # via zodb +cattrs==24.1.2 + # via requests-cache +certifi==2025.1.31 + # via requests +cffi==1.17.1 + # via persistent +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via + # libcoveoc4ids + # libcoveocds +defusedxml==0.7.1 + # via odfpy +et-xmlfile==2.0.0 + # via openpyxl +flattentool==0.27.0 + # via + # libcove + # oc4ids-datastore-pipeline (pyproject.toml) +idna==3.10 + # via requests +ijson==3.3.0 + # via flattentool +jmespath==1.0.1 + # via + # boto3 + # botocore +json-merge-patch==0.2 + # via ocdsextensionregistry 
+jsonref==1.1.0 + # via + # flattentool + # libcove + # libcoveocds + # ocdsextensionregistry +jsonschema==4.23.0 + # via + # libcove + # libcoveocds +jsonschema-specifications==2024.10.1 + # via jsonschema +libcove==0.32.1 + # via + # libcoveoc4ids + # libcoveocds +libcoveoc4ids==0.9.0 + # via oc4ids-datastore-pipeline (pyproject.toml) +libcoveocds==0.16.4 + # via libcoveoc4ids +lxml==5.3.1 + # via flattentool +mako==1.3.9 + # via alembic +markupsafe==3.0.2 + # via mako +ocdsextensionregistry==0.6.9 + # via libcoveocds +odfpy==1.4.1 + # via flattentool +openpyxl==3.1.5 + # via flattentool +persistent==6.1 + # via + # btrees + # zodb +platformdirs==4.3.6 + # via requests-cache +psycopg2==2.9.10 + # via oc4ids-datastore-pipeline (pyproject.toml) +pycparser==2.22 + # via cffi +python-dateutil==2.9.0.post0 + # via botocore +python-dotenv==1.0.1 + # via oc4ids-datastore-pipeline (pyproject.toml) +pytz==2025.1 + # via flattentool +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # libcove + # libcoveocds +requests==2.32.3 + # via + # libcove + # libcoveocds + # oc4ids-datastore-pipeline (pyproject.toml) + # ocdsextensionregistry + # requests-cache +requests-cache==1.2.1 + # via ocdsextensionregistry +rfc3339-validator==0.1.4 + # via libcove +rfc3987==1.3.8 + # via libcove +rpds-py==0.22.3 + # via + # jsonschema + # referencing +s3transfer==0.11.2 + # via boto3 +schema==0.7.7 + # via flattentool +six==1.17.0 + # via + # python-dateutil + # rfc3339-validator + # url-normalize +sqlalchemy==2.0.38 + # via + # alembic + # oc4ids-datastore-pipeline (pyproject.toml) +transaction==5.0 + # via zodb +typing-extensions==4.12.2 + # via + # alembic + # referencing + # sqlalchemy +url-normalize==1.4.3 + # via requests-cache +urllib3==2.3.0 + # via + # botocore + # requests + # requests-cache +xmltodict==0.14.2 + # via flattentool +zc-lockfile==3.0.post1 + # via zodb +zc-zlibstorage==1.2.0 + # via flattentool +zconfig==4.2 + # via zodb +zodb==6.0 + # via + # 
flattentool + # zc-zlibstorage +zodbpickle==4.1.1 + # via zodb +zope-deferredimport==5.0 + # via persistent +zope-interface==7.2 + # via + # btrees + # persistent + # transaction + # zc-zlibstorage + # zodb + # zope-proxy +zope-proxy==6.1 + # via zope-deferredimport + +# The following packages are considered to be unsafe in a requirements file: +# setuptools From 77c9c398c23ebf0434ec3bc901c372dd5d960af7 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:14:15 +0200 Subject: [PATCH 02/11] build: add Dockerfile --- Dockerfile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..65ff616 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +RUN apt-get update \ + && apt-get install -y libpq-dev gcc + +WORKDIR /oc4ids_datastore_pipeline + +COPY requirements.txt . + +RUN pip install -r requirements.txt + +COPY . . + +RUN pip install . + +ENTRYPOINT ["sh", "-c", "alembic upgrade head && oc4ids-datastore-pipeline"] From 1b25484606a6ba47de2671404759a61fdf3145cc Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:16:41 +0200 Subject: [PATCH 03/11] ci: add docker image build check to CI --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 019e378..00c890f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,3 +23,5 @@ jobs: run: mypy oc4ids_datastore_pipeline/ tests/ - name: Run tests run: pytest + - name: Build docker image + run: docker build -t oc4ids-datastore-pipeline . 
From c30ac6508a537171b8b1e7ccc510f2082e6ae504 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 12:31:55 +0200 Subject: [PATCH 04/11] ci: build and push image to ghcr --- .github/workflows/build-and-push-image.yml | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/build-and-push-image.yml diff --git a/.github/workflows/build-and-push-image.yml b/.github/workflows/build-and-push-image.yml new file mode 100644 index 0000000..69e4f5e --- /dev/null +++ b/.github/workflows/build-and-push-image.yml @@ -0,0 +1,23 @@ +name: CI +on: [push] + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + env: + IMAGE_NAME: "oc4ids-datastore-pipeline" + steps: + - uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push image + run: | + IMAGE_ID=ghcr.io/${{ github.repository_owner }}/$IMAGE_NAME + IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') + echo $IMAGE_ID + docker build . 
-t ${IMAGE_ID} + docker push ${IMAGE_ID}:latest From 9b5957b229a2517c89117c5c0ba452eb05e6c249 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:45:52 +0200 Subject: [PATCH 05/11] fix: make json_url nullable --- .../3499656b84e7_allow_nullable_json_url.py | 31 +++++++++++++++++++ oc4ids_datastore_pipeline/database.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 migrations/versions/3499656b84e7_allow_nullable_json_url.py diff --git a/migrations/versions/3499656b84e7_allow_nullable_json_url.py b/migrations/versions/3499656b84e7_allow_nullable_json_url.py new file mode 100644 index 0000000..ae42e02 --- /dev/null +++ b/migrations/versions/3499656b84e7_allow_nullable_json_url.py @@ -0,0 +1,31 @@ +"""allow nullable json_url + +Revision ID: 3499656b84e7 +Revises: 084c39bf418e +Create Date: 2025-02-11 16:44:30.550413 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "3499656b84e7" +down_revision: Union[str, None] = "084c39bf418e" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column("dataset", "json_url", existing_type=sa.VARCHAR(), nullable=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column("dataset", "json_url", existing_type=sa.VARCHAR(), nullable=False) + # ### end Alembic commands ### diff --git a/oc4ids_datastore_pipeline/database.py b/oc4ids_datastore_pipeline/database.py index f699b56..ddafb8f 100644 --- a/oc4ids_datastore_pipeline/database.py +++ b/oc4ids_datastore_pipeline/database.py @@ -30,7 +30,7 @@ class Dataset(Base): publisher_name: Mapped[str] = mapped_column(String) license_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) license_name: Mapped[Optional[str]] = mapped_column(String, nullable=True) - json_url: Mapped[str] = mapped_column(String) + json_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) csv_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) xlsx_url: Mapped[Optional[str]] = mapped_column(String, nullable=True) updated_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) From d6b8d4f6226b50e7e68cca24747fd1825bea1acb Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:02:01 +0200 Subject: [PATCH 06/11] feat: add log line to signal end of pipeline --- oc4ids_datastore_pipeline/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oc4ids_datastore_pipeline/pipeline.py b/oc4ids_datastore_pipeline/pipeline.py index f21b03f..efbe5af 100644 --- a/oc4ids_datastore_pipeline/pipeline.py +++ b/oc4ids_datastore_pipeline/pipeline.py @@ -145,6 +145,7 @@ def process_registry() -> None: process_deleted_datasets(registered_datasets) for name, url in registered_datasets.items(): process_dataset(name, url) + logger.info("Finished processing all datasets") def run() -> None: From 72c6e0dc5c4cfa65a39a0c9d9a3789d67e0f113e Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 12 Feb 2025 09:17:36 +0200 Subject: [PATCH 07/11] ci: push image for tags only --- .github/workflows/build-and-push-image.yml | 18 ++++++++++++++---- 1 
file changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-push-image.yml b/.github/workflows/build-and-push-image.yml index 69e4f5e..620a811 100644 --- a/.github/workflows/build-and-push-image.yml +++ b/.github/workflows/build-and-push-image.yml @@ -1,5 +1,9 @@ -name: CI -on: [push] +name: Build and push image + +on: + push: + tags: + - "v*" jobs: build-and-push-image: @@ -14,10 +18,16 @@ jobs: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract version + run: | + TAG=${GITHUB_REF#refs/*/} + echo "VERSION=${TAG#v}" >> $GITHUB_ENV + - name: Print version + run: echo $VERSION - name: Build and push image run: | IMAGE_ID=ghcr.io/${{ github.repository_owner }}/$IMAGE_NAME IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') echo $IMAGE_ID - docker build . -t ${IMAGE_ID} - docker push ${IMAGE_ID}:latest + docker build . -t ${IMAGE_ID}:${VERSION} -t ${IMAGE_ID}:latest + docker push --all-tags ${IMAGE_ID} From ce8b63165489190cc5856ad27798345318443379 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 12 Feb 2025 09:52:38 +0200 Subject: [PATCH 08/11] ci: create tag on push to live --- .github/workflows/create-tag.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/create-tag.yml diff --git a/.github/workflows/create-tag.yml b/.github/workflows/create-tag.yml new file mode 100644 index 0000000..963f9cc --- /dev/null +++ b/.github/workflows/create-tag.yml @@ -0,0 +1,27 @@ +name: Create tag + +on: + push: + branches: + - live + +jobs: + create-tag: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install local package + run: pip install . 
+ - name: Extract version + run: | + VERSION=$(python -c "import importlib.metadata; print(importlib.metadata.version('oc4ids-datastore-pipeline'))") + echo "VERSION=$VERSION" >> $GITHUB_ENV + - name: Print version + run: echo $VERSION + - name: Create tag + run: | + git tag "v${VERSION}" + git push origin "v${VERSION}" From c952ac8dbf6a805c79e037b1ffd92d708d60b86b Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 12 Feb 2025 10:06:27 +0200 Subject: [PATCH 09/11] docs: include docs about releases in readme --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 25e3ab3..227be04 100644 --- a/README.md +++ b/README.md @@ -71,3 +71,8 @@ pytest ``` alembic revision --autogenerate -m "" ``` + +## Releasing + +On merge to `live`, a tag will be created using the version in `pyproject.toml`, +which will then trigger a docker image build. From 706b01fbf48b1352246b27987ec3c12e7a69abb4 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 12 Feb 2025 11:12:32 +0200 Subject: [PATCH 10/11] ci: create github release on merge to live --- .github/workflows/build-and-push-image.yml | 5 ++--- .../workflows/{create-tag.yml => create-release.yml} | 12 ++++++++++-- README.md | 3 +-- 3 files changed, 13 insertions(+), 7 deletions(-) rename .github/workflows/{create-tag.yml => create-release.yml} (69%) diff --git a/.github/workflows/build-and-push-image.yml b/.github/workflows/build-and-push-image.yml index 620a811..87dd7d8 100644 --- a/.github/workflows/build-and-push-image.yml +++ b/.github/workflows/build-and-push-image.yml @@ -1,9 +1,8 @@ name: Build and push image on: - push: - tags: - - "v*" + release: + types: [created] jobs: build-and-push-image: diff --git a/.github/workflows/create-tag.yml b/.github/workflows/create-release.yml similarity index 69% rename from .github/workflows/create-tag.yml rename to 
.github/workflows/create-release.yml index 963f9cc..5f30234 100644 --- a/.github/workflows/create-tag.yml +++ b/.github/workflows/create-release.yml @@ -1,4 +1,4 @@ -name: Create tag +name: Create GitHub release on: push: @@ -6,8 +6,10 @@ on: - live jobs: - create-tag: + create-release: runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -25,3 +27,9 @@ jobs: run: | git tag "v${VERSION}" git push origin "v${VERSION}" + - name: Create release + run: | + gh release create "v${VERSION}" \ + --repo="${GITHUB_REPOSITORY}" \ + --title="v${VERSION}" \ + --generate-notes diff --git a/README.md b/README.md index 227be04..f140520 100644 --- a/README.md +++ b/README.md @@ -74,5 +74,4 @@ alembic revision --autogenerate -m "" ## Releasing -On merge to `live`, a tag will be created using the version in `pyproject.toml`, -which will then trigger a docker image build. +On merge to `live`, a git tag and GitHub release will be created using the version in `pyproject.toml`, which will then trigger a docker image to be built and pushed, tagged with the version and `latest`. 
From 40f98e758175da28faf50730c90af1d7b59c9e87 Mon Sep 17 00:00:00 2001 From: Tilly Woodfield <22456167+tillywoodfield@users.noreply.github.com> Date: Wed, 12 Feb 2025 14:31:25 +0200 Subject: [PATCH 11/11] ci: remove automatic release creation --- .github/workflows/create-release.yml | 35 ---------------------------- README.md | 2 +- 2 files changed, 1 insertion(+), 36 deletions(-) delete mode 100644 .github/workflows/create-release.yml diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml deleted file mode 100644 index 5f30234..0000000 --- a/.github/workflows/create-release.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Create GitHub release - -on: - push: - branches: - - live - -jobs: - create-release: - runs-on: ubuntu-latest - env: - GH_TOKEN: ${{ github.token }} - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - name: Install local package - run: pip install . - - name: Extract version - run: | - VERSION=$(python -c "import importlib.metadata; print(importlib.metadata.version('oc4ids-datastore-pipeline'))") - echo "VERSION=$VERSION" >> $GITHUB_ENV - - name: Print version - run: echo $VERSION - - name: Create tag - run: | - git tag "v${VERSION}" - git push origin "v${VERSION}" - - name: Create release - run: | - gh release create "v${VERSION}" \ - --repo="${GITHUB_REPOSITORY}" \ - --title="v${VERSION}" \ - --generate-notes diff --git a/README.md b/README.md index f140520..6e6c35e 100644 --- a/README.md +++ b/README.md @@ -74,4 +74,4 @@ alembic revision --autogenerate -m "" ## Releasing -On merge to `live`, a git tag and GitHub release will be created using the version in `pyproject.toml`, which will then trigger a docker image to be built and pushed, tagged with the version and `latest`. +To publish a new version, raise a PR to `main` updating the version in `pyproject.toml`. Once merged, create a git tag and GitHub release for the new version, with naming `vX.Y.Z`. 
This will trigger a docker image to be built and pushed, tagged with the version and `latest`.