Skip to content

Commit c7a9e11

Browse files
authored
refactor: uses codeocean sdk (#111)
* refactor: uses codeocean sdk
* build: updates min python supported
1 parent aa2ac2e commit c7a9e11

File tree

12 files changed: 68 additions (+68) and 104 deletions (-104).

.github/workflows/publish_dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ jobs:
88
publish:
99
runs-on: ubuntu-latest
1010
steps:
11-
- uses: actions/checkout@v3
11+
- uses: actions/checkout@v4
1212
- name: Set up Docker Buildx
1313
id: buildx
1414
uses: docker/setup-buildx-action@v2

.github/workflows/publish_main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
outputs:
1212
pkg_version: ${{ steps.output_version.outputs.pkg_version }}
1313
steps:
14-
- uses: actions/checkout@v3
14+
- uses: actions/checkout@v4
1515
- name: Get version from file
1616
run: |
1717
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')

.github/workflows/run_dev_tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ jobs:
1010
runs-on: ubuntu-latest
1111
strategy:
1212
matrix:
13-
python-version: [ '3.8', '3.9', '3.10' ]
13+
python-version: [ '3.9', '3.10', '3.11' ]
1414
steps:
15-
- uses: actions/checkout@v3
15+
- uses: actions/checkout@v4
1616
- name: Set up Python ${{ matrix.python-version }}
17-
uses: actions/setup-python@v3
17+
uses: actions/setup-python@v5
1818
with:
1919
python-version: ${{ matrix.python-version }}
2020
- name: Install dependencies

.github/workflows/run_main_tests.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ jobs:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
matrix:
14-
python-version: [ '3.8', '3.9', '3.10' ]
14+
python-version: [ '3.9', '3.10', '3.11' ]
1515
steps:
16-
- uses: actions/checkout@v3
16+
- uses: actions/checkout@v4
1717
- name: Set up Python ${{ matrix.python-version }}
18-
uses: actions/setup-python@v3
18+
uses: actions/setup-python@v5
1919
with:
2020
python-version: ${{ matrix.python-version }}
2121
- name: Install dependencies
@@ -28,7 +28,7 @@ jobs:
2828
verify_version:
2929
runs-on: ubuntu-latest
3030
steps:
31-
- uses: actions/checkout@v3
31+
- uses: actions/checkout@v4
3232
- name: Check version incremented
3333
run: |
3434
pkg_name=$(grep -P 'version = \{attr = .*\}' pyproject.toml | grep -oP '\w+.__version__')

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ kept in sync:
1212
1. **S3 buckets** store raw metadata files, including the ``metadata.nd.json``.
1313
2. A **document database (DocDB)** contains unstructured json
1414
documents describing the ``metadata.nd.json`` for a data asset.
15-
3. **Code Ocean**: data assets are mounted as CodeOcean data asssets.
15+
3. **Code Ocean**: data assets are mounted as CodeOcean data assets.
1616
Processed results are also stored in an internal Code Ocean bucket.
1717

1818
We have automated jobs to keep changes in DocDB and S3 in sync.

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Configuration file for the Sphinx documentation builder."""
2+
23
#
34
# For the full list of built-in configuration values, see the documentation:
45
# https://www.sphinx-doc.org/en/master/usage/configuration.html

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
66
name = "aind-data-asset-indexer"
77
description = "Service Capsule to write data asset metadata to document store"
88
license = {text = "MIT"}
9-
requires-python = ">=3.8"
9+
requires-python = ">=3.9"
1010
authors = [
1111
{name = "AIND"}
1212
]
@@ -24,7 +24,7 @@ dependencies = [
2424
"pymongo==4.3.3",
2525
"dask==2023.5.0",
2626
"aind-data-schema==1.2.0",
27-
"aind-codeocean-api==0.5.0",
27+
"codeocean==0.3.0",
2828
]
2929

3030
[project.optional-dependencies]

src/aind_data_asset_indexer/aind_bucket_indexer.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,12 @@ def _resolve_schema_information(
272272
object_key = create_object_key(
273273
prefix=prefix, filename=core_schema_file_name
274274
)
275-
common_kwargs[
276-
"core_schema_info_in_root"
277-
] = get_dict_of_file_info(
278-
s3_client=s3_client,
279-
bucket=self.job_settings.s3_bucket,
280-
keys=[object_key],
281-
).get(
282-
object_key
275+
common_kwargs["core_schema_info_in_root"] = (
276+
get_dict_of_file_info(
277+
s3_client=s3_client,
278+
bucket=self.job_settings.s3_bucket,
279+
keys=[object_key],
280+
).get(object_key)
283281
)
284282
self._copy_file_from_root_to_subdir(**common_kwargs)
285283
# If field is null, a file exists in the root folder, and
@@ -424,9 +422,9 @@ def _process_docdb_record(
424422
)
425423
db = docdb_client[self.job_settings.doc_db_db_name]
426424
collection = db[self.job_settings.doc_db_collection_name]
427-
fields_to_update[
428-
"last_modified"
429-
] = datetime.utcnow().isoformat()
425+
fields_to_update["last_modified"] = (
426+
datetime.utcnow().isoformat()
427+
)
430428
response = collection.update_one(
431429
{"_id": docdb_record["_id"]},
432430
{"$set": fields_to_update},

src/aind_data_asset_indexer/codeocean_bucket_indexer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
import boto3
1414
import dask.bag as dask_bag
1515
import requests
16-
from aind_codeocean_api.codeocean import CodeOceanClient
1716
from aind_data_schema.core.metadata import ExternalPlatforms
17+
from codeocean import CodeOcean
1818
from mypy_boto3_s3 import S3Client
1919
from pymongo import MongoClient
2020
from pymongo.operations import UpdateOne
2121
from requests.exceptions import ReadTimeout
22+
from urllib3.util import Retry
2223

2324
from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
2425
from aind_data_asset_indexer.utils import (
@@ -394,9 +395,16 @@ def _delete_records_from_docdb(self, record_list: List[str]):
394395
def run_job(self):
395396
"""Main method to run."""
396397
logging.info("Starting to scan through CodeOcean.")
397-
co_client = CodeOceanClient(
398+
retry = Retry(
399+
total=5,
400+
backoff_factor=1,
401+
status_forcelist=[429, 500, 502, 503, 504],
402+
allowed_methods=["GET", "POST"],
403+
)
404+
co_client = CodeOcean(
398405
domain=self.job_settings.codeocean_domain,
399406
token=self.job_settings.codeocean_token.get_secret_value(),
407+
retries=retry,
400408
)
401409
code_ocean_records = get_all_processed_codeocean_asset_records(
402410
co_client=co_client,

src/aind_data_asset_indexer/utils.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from typing import Dict, Iterator, List, Optional
1010
from urllib.parse import urlparse
1111

12-
from aind_codeocean_api.codeocean import CodeOceanClient
1312
from aind_data_schema.core.data_description import DataLevel, DataRegex
1413
from aind_data_schema.core.metadata import CORE_FILES as CORE_SCHEMAS
1514
from aind_data_schema.core.metadata import (
@@ -18,6 +17,12 @@
1817
create_metadata_json,
1918
)
2019
from botocore.exceptions import ClientError
20+
from codeocean import CodeOcean
21+
from codeocean.data_asset import (
22+
DataAssetSearchParams,
23+
DataAssetState,
24+
DataAssetType,
25+
)
2126
from mypy_boto3_s3 import S3Client
2227
from mypy_boto3_s3.type_defs import (
2328
PaginatorConfigTypeDef,
@@ -934,7 +939,7 @@ def build_docdb_location_to_id_map(
934939

935940

936941
def get_all_processed_codeocean_asset_records(
937-
co_client: CodeOceanClient, co_data_asset_bucket: str
942+
co_client: CodeOcean, co_data_asset_bucket: str
938943
) -> Dict[str, dict]:
939944
"""
940945
Gets all the data asset records we're interested in indexing. The location
@@ -943,7 +948,7 @@ def get_all_processed_codeocean_asset_records(
943948
944949
Parameters
945950
----------
946-
co_client : CodeOceanClient
951+
co_client : CodeOcean
947952
co_data_asset_bucket : str
948953
Name of Code Ocean's data asset bucket
949954
Returns
@@ -966,31 +971,27 @@ def get_all_processed_codeocean_asset_records(
966971
all_responses = dict()
967972

968973
for tag in {DataLevel.DERIVED.value, "processed"}:
969-
response = co_client.search_all_data_assets(
970-
type="result", query=f"tag:{tag}"
974+
search_params = DataAssetSearchParams(
975+
type=DataAssetType.Result, query=f"tag:{tag}"
976+
)
977+
iter_response = co_client.data_assets.search_data_assets_iterator(
978+
search_params=search_params
971979
)
972-
# There is a bug with the codeocean api that caps the number of
973-
# results in a single request to 10000.
974-
if len(response.json()["results"]) >= 10000:
975-
logging.warning(
976-
"Number of records exceeds 10,000! This can lead to "
977-
"possible data loss."
978-
)
979980
# Extract relevant information
980981
extracted_info = dict()
981-
for data_asset_info in response.json()["results"]:
982-
data_asset_id = data_asset_info["id"]
983-
data_asset_name = data_asset_info["name"]
984-
created_timestamp = data_asset_info["created"]
982+
for data_asset_info in iter_response:
983+
data_asset_id = data_asset_info.id
984+
data_asset_name = data_asset_info.name
985+
created_timestamp = data_asset_info.created
985986
created_datetime = datetime.fromtimestamp(
986987
created_timestamp, tz=timezone.utc
987988
)
988989
# Results hosted externally have a source_bucket field
989-
is_external = (
990-
data_asset_info.get("sourceBucket") is not None
991-
or data_asset_info.get("source_bucket") is not None
992-
)
993-
if not is_external and data_asset_info.get("state") == "ready":
990+
is_external = data_asset_info.source_bucket is not None
991+
if (
992+
not is_external
993+
and data_asset_info.state == DataAssetState.Ready
994+
):
994995
location = f"s3://{co_data_asset_bucket}/{data_asset_id}"
995996
extracted_info[location] = {
996997
"name": data_asset_name,

0 commit comments

Comments
 (0)