Merge pull request #92 from AllenNeuralDynamics/release-v0.10.0

jtyoung84 · web-flow · commit 7342be3c8d2e · 2024-09-06T23:40:18.000Z
Release v0.10.0
diff --git a/.github/workflows/publish_dev.yml b/.github/workflows/publish_dev.yml
@@ -1,21 +1,14 @@
-name: Tag and publish
+name: Publish dev
 on:
   push:
     branches:
-      - main
-jobs:
-  tag:
-    uses: AllenNeuralDynamics/aind-github-actions/.github/workflows/tag.yml@main
-    secrets:
-      SERVICE_TOKEN: ${{ secrets.SERVICE_TOKEN }}
+      - dev
 
+jobs:
   publish:
     runs-on: ubuntu-latest
-    needs: tag
     steps:
       - uses: actions/checkout@v3
-      - name: Pull latest changes
-        run: git pull origin main
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v2
@@ -32,5 +25,4 @@ jobs:
           context: .
           push: true
           tags: |
-            ghcr.io/allenneuraldynamics/aind-data-asset-indexer:${{ needs.tag.outputs.new_version }}
-            ghcr.io/allenneuraldynamics/aind-data-asset-indexer:latest
+            ghcr.io/allenneuraldynamics/aind-data-asset-indexer:dev
diff --git a/.github/workflows/publish_main.yml b/.github/workflows/publish_main.yml
@@ -0,0 +1,48 @@
+# On every push to main: read the package version, tag the commit with it,
+# then build and push the Docker image under that version and "latest".
+name: Tag and publish main
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  tag_and_publish:
+    name: Parse version
+    runs-on: ubuntu-latest
+    outputs:
+      pkg_version: ${{ steps.output_version.outputs.pkg_version }}
+    steps:
+    - uses: actions/checkout@v3
+    - name: Get version from file
+      # The id is what the job-level outputs mapping above refers to.
+      id: output_version
+      run: |
+        pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')
+        init_file="./src/${pkg_name//.__version__}/__init__.py"
+        pkg_version=$(grep -Po '[0-9]+\.[0-9]+\.[0-9]+' "$init_file")
+        echo "docker_tag=$pkg_version" >> "$GITHUB_ENV"
+        echo "pkg_version=$pkg_version" >> "$GITHUB_OUTPUT"
+    - name: Create git tag
+      run: |
+        git tag "v${{ env.docker_tag }}"
+    - name: Push git tag
+      run: git push origin "v${{ env.docker_tag }}"
+    - name: Set up Docker Buildx
+      id: buildx
+      uses: docker/setup-buildx-action@v2
+    - name: Login to Github Packages
+      uses: docker/login-action@v2
+      with:
+        registry: ghcr.io
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: Build image and push to GitHub Container Registry
+      uses: docker/build-push-action@v3
+      with:
+        # relative path to the place where source code with Dockerfile is located
+        context: .
+        push: true
+        tags: |
+          ghcr.io/allenneuraldynamics/aind-data-asset-indexer:${{ env.docker_tag }}
+          ghcr.io/allenneuraldynamics/aind-data-asset-indexer:latest
diff --git a/.github/workflows/run_dev_tests.yml b/.github/workflows/run_dev_tests.yml
@@ -1,9 +1,9 @@
-name: Lint and run tests
+name: Run checks in dev
 
 on:
   pull_request:
     branches:
-      - main
+      - dev
 
 jobs:
   ci:
diff --git a/.github/workflows/run_main_tests.yml b/.github/workflows/run_main_tests.yml
@@ -0,0 +1,47 @@
+# Checks for pull requests into main or a release branch: lint and unit tests
+# on each Python version in the matrix, plus a guard that the version changed.
+name: Run checks in main and release
+
+on:
+  pull_request:
+    branches:
+      - '*release*'
+      - main
+
+jobs:
+  ci:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [ '3.8', '3.9', '3.10' ]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install -e ".[dev]"
+    - name: Run linter checks
+      run: flake8 . && interrogate --verbose .
+    - name: Run tests and coverage
+      run: coverage run -m unittest discover && coverage report
+  verify_version:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check version incremented
+        run: |
+          pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')
+          init_file="./src/${pkg_name//.__version__}/__init__.py"
+          pkg_version=$(grep -Po '[0-9]+\.[0-9]+\.[0-9]+' "$init_file")
+          latest_tag=$(git ls-remote --tags --refs --sort="v:refname" | tail -n1 | sed 's/.*\///')
+          echo "Checking pkg_version v$pkg_version and latest_tag $latest_tag"
+          if [ "$latest_tag" == "v$pkg_version" ]
+          then
+            # Surface the reason in the job log before failing the check.
+            echo "::error::Version v$pkg_version is already tagged; bump __version__."
+            exit 1
+          fi
+          echo "Versions are different"
diff --git a/src/aind_data_asset_indexer/__init__.py b/src/aind_data_asset_indexer/__init__.py
@@ -1,3 +1,3 @@
 """Package"""
 
-__version__ = "0.9.5"
+__version__ = "0.10.0"
diff --git a/src/aind_data_asset_indexer/aind_bucket_indexer.py b/src/aind_data_asset_indexer/aind_bucket_indexer.py
@@ -266,12 +266,14 @@ def _resolve_schema_information(
                 object_key = create_object_key(
                     prefix=prefix, filename=core_schema_file_name
                 )
-                common_kwargs["core_schema_info_in_root"] = (
-                    get_dict_of_file_info(
-                        s3_client=s3_client,
-                        bucket=self.job_settings.s3_bucket,
-                        keys=[object_key],
-                    ).get(object_key)
+                common_kwargs[
+                    "core_schema_info_in_root"
+                ] = get_dict_of_file_info(
+                    s3_client=s3_client,
+                    bucket=self.job_settings.s3_bucket,
+                    keys=[object_key],
+                ).get(
+                    object_key
                 )
                 self._copy_file_from_root_to_subdir(**common_kwargs)
             # If field is null, a file exists in the root folder, and
diff --git a/src/aind_data_asset_indexer/codeocean_bucket_indexer.py b/src/aind_data_asset_indexer/codeocean_bucket_indexer.py
@@ -6,13 +6,18 @@
 import os
 import sys
 import warnings
-from typing import List
+from datetime import datetime
+from typing import List, Optional, Union
 
 import boto3
 import dask.bag as dask_bag
+import requests
 from aind_codeocean_api.codeocean import CodeOceanClient
+from aind_data_schema.core.metadata import ExternalPlatforms
 from mypy_boto3_s3 import S3Client
 from pymongo import MongoClient
+from pymongo.operations import UpdateOne
+from requests.exceptions import ReadTimeout
 
 from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
 from aind_data_asset_indexer.utils import (
@@ -44,6 +49,184 @@ def __init__(self, job_settings: CodeOceanIndexBucketJobSettings):
         """Class constructor."""
         self.job_settings = job_settings
 
+    def _get_external_data_asset_records(self) -> Optional[List[dict]]:
+        """
+        Retrieves list of code ocean ids and locations for external data
+        assets. The timeout is set to 600 seconds.
+        Returns
+        -------
+        List[dict] | None
+          List items have shape {"id": str, "source": str}. If any error
+          occurs (bad status, request failure, invalid JSON), return None.
+        """
+        url = self.job_settings.temp_codeocean_endpoint
+        try:
+            response = requests.get(url, timeout=600)
+            if response.status_code == 200:
+                return response.json()
+            logging.error(f"Status {response.status_code} from {url}")
+            return None
+        except ReadTimeout:
+            logging.error(f"Read timed out at {url}")
+            return None
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # Honor the documented contract: connection failures and invalid
+            # JSON also yield None instead of aborting the whole job.
+            logging.error(f"Error retrieving records from {url}: {e}")
+            return None
+
+    @staticmethod
+    def _map_external_list_to_dict(external_recs: List[dict]) -> dict:
+        """
+        Maps the response received from Code Ocean into a dict. For example,
+        [{"id": "abc", "source": "s3://bucket/prefix"},
+        {"id": "def", "source": "s3://bucket/prefix"}]
+        will be mapped to {"s3://bucket/prefix": {"abc", "def"}}
+        Parameters
+        ----------
+        external_recs : List[dict]
+          Each item needs an "id" key; items whose "source" is missing or
+          None are skipped since they cannot be matched to a location.
+        Returns
+        -------
+        dict
+          Maps each source location to the set of ids found at it.
+        """
+        new_records = dict()
+        for r in external_recs:
+            location = r.get("source")
+            rec_id = r["id"]
+            if location is None:
+                # Previously such records overwrote each other under the
+                # key None instead of being grouped by a real location.
+                continue
+            new_records.setdefault(location, set()).add(rec_id)
+        return new_records
+
+    @staticmethod
+    def _get_co_links_from_record(
+        docdb_record: Union[dict, list]
+    ) -> List[str]:
+        """
+        Small utility to parse the external_links field of the docdb record.
+        Supports the legacy type of that field.
+        Parameters
+        ----------
+        docdb_record : dict
+          The record itself is read with .get, so it must be a mapping. Its
+          external_links field was a list in the legacy type, now a dict.
+        Returns
+        -------
+        List[str]
+          Ids under the Code Ocean key (None for legacy items lacking it).
+        """
+        external_links = docdb_record.get("external_links", [])
+
+        # Hopefully, ExternalPlatforms.CODEOCEAN doesn't change
+        if isinstance(external_links, dict):
+            external_links = external_links.get(
+                ExternalPlatforms.CODEOCEAN.value, []
+            )
+        else:
+            external_links = [
+                r.get(ExternalPlatforms.CODEOCEAN.value)
+                for r in external_links
+            ]
+        return external_links
+
+    def _update_external_links_in_docdb(
+        self, docdb_client: MongoClient
+    ) -> None:
+        """
+        This method will:
+        1) Retrieve a list of codeocean data asset ids and locations from CO
+        2) Paginate through the docdb records where the location doesn't match
+        the internal co bucket.
+        3) Add or remove the external_links from the docdb record if needed.
+        Parameters
+        ----------
+        docdb_client : MongoClient
+          Client used both to page through records and to bulk write updates.
+        Returns
+        -------
+        None
+          Logs an error and writes nothing if the CO request failed.
+        """
+        # Should return a list like [{"id": co_id, "source": "s3://..."},]
+        list_of_co_ids_and_locations = self._get_external_data_asset_records()
+        db = docdb_client[self.job_settings.doc_db_db_name]
+        collection = db[self.job_settings.doc_db_collection_name]
+        if list_of_co_ids_and_locations is not None:
+            co_loc_to_id_map = self._map_external_list_to_dict(
+                list_of_co_ids_and_locations
+            )
+            pages = paginate_docdb(
+                docdb_client=docdb_client,
+                db_name=self.job_settings.doc_db_db_name,
+                collection_name=self.job_settings.doc_db_collection_name,
+                filter_query={
+                    "location": {
+                        "$not": {
+                            "$regex": f"^s3://{self.job_settings.s3_bucket}.*"
+                        }
+                    }
+                },
+                projection={"_id": 1, "location": 1, "external_links": 1},
+                page_size=500,
+            )
+            for page in pages:
+                records_to_update = []
+                for record in page:
+                    location = record.get("location")
+                    external_links = self._get_co_links_from_record(record)
+                    code_ocean_ids = None
+                    if location is not None:
+                        code_ocean_ids = co_loc_to_id_map.get(location)
+                    docdb_rec_id = record["_id"]
+                    if (
+                        external_links is not None
+                        and code_ocean_ids is not None
+                        and code_ocean_ids != set(external_links)
+                    ):
+                        new_external_links = code_ocean_ids
+                    elif external_links and not code_ocean_ids:
+                        # Only records that still hold links need clearing;
+                        # already-empty ones must not be rewritten each run.
+                        logging.info(
+                            f"No code ocean data asset ids found for "
+                            f"{location}. Removing external links from record."
+                        )
+                        new_external_links = set()
+                    else:
+                        new_external_links = None
+                    if new_external_links is not None:
+                        record_links = {
+                            ExternalPlatforms.CODEOCEAN.value: sorted(
+                                list(new_external_links)
+                            )
+                        }
+                        last_modified = datetime.utcnow().isoformat()
+                        records_to_update.append(
+                            UpdateOne(
+                                filter={"_id": docdb_rec_id},
+                                update={
+                                    "$set": {
+                                        "external_links": record_links,
+                                        "last_modified": last_modified,
+                                    }
+                                },
+                                upsert=False,
+                            )
+                        )
+                if len(records_to_update) > 0:
+                    logging.info(f"Updating {len(records_to_update)} records")
+                    write_response = collection.bulk_write(
+                        requests=records_to_update
+                    )
+                    logging.debug(write_response)
+        else:
+            logging.error("There was an error retrieving external links!")
+
     def _process_codeocean_record(
         self,
         codeocean_record: dict,
@@ -220,6 +403,12 @@ def run_job(self):
             password=self.job_settings.doc_db_password.get_secret_value(),
             authSource="admin",
         )
+        # Use existing client to add external links to fields
+        logging.info("Adding links to records.")
+        self._update_external_links_in_docdb(
+            docdb_client=iterator_docdb_client
+        )
+        logging.info("Finished adding links to records")
         all_docdb_records = dict()
         docdb_pages = paginate_docdb(
             db_name=self.job_settings.doc_db_db_name,
diff --git a/src/aind_data_asset_indexer/models.py b/src/aind_data_asset_indexer/models.py
@@ -122,6 +122,12 @@ class CodeOceanIndexBucketJobSettings(IndexJobSettings):
     doc_db_collection_name: str
     codeocean_domain: str
     codeocean_token: SecretStr
+    temp_codeocean_endpoint: str = Field(
+        description=(
+            "Temp proxy to access code ocean information from their analytics "
+            "databases."
+        )
+    )
 
     @classmethod
     def from_param_store(cls, param_store_name: str):
diff --git a/tests/test_codeocean_bucket_indexer.py b/tests/test_codeocean_bucket_indexer.py
diff --git a/tests/test_models.py b/tests/test_models.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`"""Package"""`
`2`	`2`
`3`		`-__version__ = "0.9.5"`
	`3`	`+__version__ = "0.10.0"`