Skip to content

Commit 9570f4e

Browse files
authored
Merge pull request #115 from AllenNeuralDynamics/release-v0.14.0
Release v0.14.0
2 parents 8dc615c + 77c1c2e commit 9570f4e

File tree

15 files changed

+209
-187
lines changed

.github/workflows/publish_dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ jobs:
88
publish:
99
runs-on: ubuntu-latest
1010
steps:
11-
- uses: actions/checkout@v3
11+
- uses: actions/checkout@v4
1212
- name: Set up Docker Buildx
1313
id: buildx
1414
uses: docker/setup-buildx-action@v2

.github/workflows/publish_main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
outputs:
1212
pkg_version: ${{ steps.output_version.outputs.pkg_version }}
1313
steps:
14-
- uses: actions/checkout@v3
14+
- uses: actions/checkout@v4
1515
- name: Get version from file
1616
run: |
1717
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')

.github/workflows/run_dev_tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ jobs:
1010
runs-on: ubuntu-latest
1111
strategy:
1212
matrix:
13-
python-version: [ '3.8', '3.9', '3.10' ]
13+
python-version: [ '3.9', '3.10', '3.11' ]
1414
steps:
15-
- uses: actions/checkout@v3
15+
- uses: actions/checkout@v4
1616
- name: Set up Python ${{ matrix.python-version }}
17-
uses: actions/setup-python@v3
17+
uses: actions/setup-python@v5
1818
with:
1919
python-version: ${{ matrix.python-version }}
2020
- name: Install dependencies

.github/workflows/run_main_tests.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ jobs:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
matrix:
14-
python-version: [ '3.8', '3.9', '3.10' ]
14+
python-version: [ '3.9', '3.10', '3.11' ]
1515
steps:
16-
- uses: actions/checkout@v3
16+
- uses: actions/checkout@v4
1717
- name: Set up Python ${{ matrix.python-version }}
18-
uses: actions/setup-python@v3
18+
uses: actions/setup-python@v5
1919
with:
2020
python-version: ${{ matrix.python-version }}
2121
- name: Install dependencies
@@ -28,7 +28,7 @@ jobs:
2828
verify_version:
2929
runs-on: ubuntu-latest
3030
steps:
31-
- uses: actions/checkout@v3
31+
- uses: actions/checkout@v4
3232
- name: Check version incremented
3333
run: |
3434
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ kept in sync:
1212
1. **S3 buckets** store raw metadata files, including the ``metadata.nd.json``.
1313
2. A **document database (DocDB)** contains unstructured json
1414
documents describing the ``metadata.nd.json`` for a data asset.
15-
3. **Code Ocean**: data assets are mounted as CodeOcean data asssets.
15+
3. **Code Ocean**: data assets are mounted as CodeOcean data assets.
1616
Processed results are also stored in an internal Code Ocean bucket.
1717

1818
We have automated jobs to keep changes in DocDB and S3 in sync.

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Configuration file for the Sphinx documentation builder."""
2+
23
#
34
# For the full list of built-in configuration values, see the documentation:
45
# https://www.sphinx-doc.org/en/master/usage/configuration.html

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
66
name = "aind-data-asset-indexer"
77
description = "Service Capsule to write data asset metadata to document store"
88
license = {text = "MIT"}
9-
requires-python = ">=3.8"
9+
requires-python = ">=3.9"
1010
authors = [
1111
{name = "AIND"}
1212
]
@@ -24,7 +24,7 @@ dependencies = [
2424
"pymongo==4.3.3",
2525
"dask==2023.5.0",
2626
"aind-data-schema==1.2.0",
27-
"aind-codeocean-api==0.5.0",
27+
"codeocean==0.3.0",
2828
]
2929

3030
[project.optional-dependencies]

src/aind_data_asset_indexer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Package"""
22

3-
__version__ = "0.13.0"
3+
__version__ = "0.14.0"

src/aind_data_asset_indexer/aind_bucket_indexer.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,12 @@ def _resolve_schema_information(
272272
object_key = create_object_key(
273273
prefix=prefix, filename=core_schema_file_name
274274
)
275-
common_kwargs[
276-
"core_schema_info_in_root"
277-
] = get_dict_of_file_info(
278-
s3_client=s3_client,
279-
bucket=self.job_settings.s3_bucket,
280-
keys=[object_key],
281-
).get(
282-
object_key
275+
common_kwargs["core_schema_info_in_root"] = (
276+
get_dict_of_file_info(
277+
s3_client=s3_client,
278+
bucket=self.job_settings.s3_bucket,
279+
keys=[object_key],
280+
).get(object_key)
283281
)
284282
self._copy_file_from_root_to_subdir(**common_kwargs)
285283
# If field is null, a file exists in the root folder, and
@@ -424,9 +422,9 @@ def _process_docdb_record(
424422
)
425423
db = docdb_client[self.job_settings.doc_db_db_name]
426424
collection = db[self.job_settings.doc_db_collection_name]
427-
fields_to_update[
428-
"last_modified"
429-
] = datetime.utcnow().isoformat()
425+
fields_to_update["last_modified"] = (
426+
datetime.utcnow().isoformat()
427+
)
430428
response = collection.update_one(
431429
{"_id": docdb_record["_id"]},
432430
{"$set": fields_to_update},

src/aind_data_asset_indexer/codeocean_bucket_indexer.py

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@
1212

1313
import boto3
1414
import dask.bag as dask_bag
15-
import requests
16-
from aind_codeocean_api.codeocean import CodeOceanClient
1715
from aind_data_schema.core.metadata import ExternalPlatforms
16+
from codeocean import CodeOcean
17+
from codeocean.data_asset import DataAssetSearchOrigin, DataAssetSearchParams
1818
from mypy_boto3_s3 import S3Client
1919
from pymongo import MongoClient
2020
from pymongo.operations import UpdateOne
21-
from requests.exceptions import ReadTimeout
21+
from urllib3.util import Retry
2222

2323
from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
2424
from aind_data_asset_indexer.utils import (
@@ -52,30 +52,51 @@ def __init__(self, job_settings: CodeOceanIndexBucketJobSettings):
5252
"""Class constructor."""
5353
self.job_settings = job_settings
5454

55-
def _get_external_data_asset_records(self) -> Optional[List[dict]]:
55+
@staticmethod
56+
def _get_external_data_asset_records(
57+
co_client: CodeOcean,
58+
) -> Optional[List[dict]]:
5659
"""
5760
Retrieves list of code ocean ids and locations for external data
5861
assets. The timeout is set to 600 seconds.
62+
63+
Parameters
64+
----------
65+
co_client : CodeOcean
66+
5967
Returns
6068
-------
6169
List[dict] | None
6270
List items have shape {"id": str, "location": str}. If error occurs,
6371
return None.
72+
6473
"""
6574
try:
66-
response = requests.get(
67-
self.job_settings.temp_codeocean_endpoint,
68-
timeout=600,
75+
search_params = DataAssetSearchParams(
76+
archived=False,
77+
origin=DataAssetSearchOrigin.External,
78+
limit=1000,
6979
)
70-
if response.status_code == 200:
71-
return response.json()
72-
else:
73-
return None
74-
except ReadTimeout:
75-
logging.error(
76-
f"Read timed out at "
77-
f"{self.job_settings.temp_codeocean_endpoint}"
80+
data_assets = co_client.data_assets.search_data_assets_iterator(
81+
search_params=search_params
7882
)
83+
external_records = []
84+
for data_asset in data_assets:
85+
data_asset_source = data_asset.source_bucket
86+
if (
87+
data_asset_source is not None
88+
and data_asset_source.bucket is not None
89+
and data_asset_source.prefix is not None
90+
):
91+
bucket = data_asset_source.bucket
92+
prefix = data_asset_source.prefix
93+
location = f"s3://{bucket}/{prefix}"
94+
external_records.append(
95+
{"id": data_asset.id, "location": location}
96+
)
97+
return external_records
98+
except Exception as e:
99+
logging.exception(e)
79100
return None
80101

81102
@staticmethod
@@ -97,7 +118,7 @@ def _map_external_list_to_dict(external_recs: List[dict]) -> dict:
97118
"""
98119
new_records = dict()
99120
for r in external_recs:
100-
location = r.get("source")
121+
location = r.get("location")
101122
rec_id = r["id"]
102123
if location is not None and new_records.get(location) is not None:
103124
old_id_set = new_records.get(location)
@@ -140,7 +161,7 @@ def _get_co_links_from_record(
140161
return external_links
141162

142163
def _update_external_links_in_docdb(
143-
self, docdb_client: MongoClient
164+
self, docdb_client: MongoClient, co_client: CodeOcean
144165
) -> None:
145166
"""
146167
This method will:
@@ -159,7 +180,9 @@ def _update_external_links_in_docdb(
159180
160181
"""
161182
# Should return a list like [{"id": co_id, "location": "s3://..."},]
162-
list_of_co_ids_and_locations = self._get_external_data_asset_records()
183+
list_of_co_ids_and_locations = self._get_external_data_asset_records(
184+
co_client=co_client
185+
)
163186
db = docdb_client[self.job_settings.doc_db_db_name]
164187
collection = db[self.job_settings.doc_db_collection_name]
165188
if list_of_co_ids_and_locations is not None:
@@ -394,9 +417,16 @@ def _delete_records_from_docdb(self, record_list: List[str]):
394417
def run_job(self):
395418
"""Main method to run."""
396419
logging.info("Starting to scan through CodeOcean.")
397-
co_client = CodeOceanClient(
420+
retry = Retry(
421+
total=5,
422+
backoff_factor=1,
423+
status_forcelist=[429, 500, 502, 503, 504],
424+
allowed_methods=["GET", "POST"],
425+
)
426+
co_client = CodeOcean(
398427
domain=self.job_settings.codeocean_domain,
399428
token=self.job_settings.codeocean_token.get_secret_value(),
429+
retries=retry,
400430
)
401431
code_ocean_records = get_all_processed_codeocean_asset_records(
402432
co_client=co_client,
@@ -416,7 +446,7 @@ def run_job(self):
416446
# Use existing client to add external links to fields
417447
logging.info("Adding links to records.")
418448
self._update_external_links_in_docdb(
419-
docdb_client=iterator_docdb_client
449+
docdb_client=iterator_docdb_client, co_client=co_client
420450
)
421451
logging.info("Finished adding links to records")
422452
all_docdb_records = dict()

0 commit comments

Comments
 (0)