
Commit 78c4c50

Merge pull request #176 from AllenNeuralDynamics/release-v0.21.0
Release v0.21.0
2 parents 2d2f96d + 57bc466 commit 78c4c50

7 files changed: +140 −175 lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -23,8 +23,8 @@ dependencies = [
     "pydantic>=2.10",
     "dask==2023.5.0",
     "aind-data-schema==1.4.0",
-    "codeocean==0.3.0",
-    "aind-data-access-api>=1.6.0",
+    "codeocean>=0.10.0",
+    "aind-data-access-api>=1.9.0",
 ]

 [project.optional-dependencies]
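
The only functional change here is the pair of dependency bumps: the Code Ocean SDK moves from a pinned 0.3.0 to >=0.10.0, and aind-data-access-api moves from >=1.6.0 to >=1.9.0. A minimal sketch (standard library only) for confirming that an installed environment satisfies the new floors; the package names come straight from the diff above:

# Sketch: print the installed versions of the bumped dependencies.
# Assumes the packages are already installed in the current environment.
from importlib.metadata import version

for pkg in ("codeocean", "aind-data-access-api"):
    print(pkg, version(pkg))
# Expected after this release: codeocean >= 0.10.0, aind-data-access-api >= 1.9.0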
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 """Package"""

-__version__ = "0.20.2"
+__version__ = "0.21.0"

src/aind_data_asset_indexer/codeocean_bucket_indexer.py

Lines changed: 14 additions & 38 deletions
@@ -2,28 +2,24 @@
 DocDB."""

 import argparse
-import json
 import logging
 import os
 import sys
 import warnings
 from typing import List, Optional

-import boto3
 import dask.bag as dask_bag
 import requests
 from aind_data_access_api.document_db import MetadataDbClient
-from aind_data_access_api.utils import get_s3_bucket_and_prefix, paginate_docdb
+from aind_data_access_api.utils import paginate_docdb
 from aind_data_schema.core.metadata import ExternalPlatforms
 from codeocean import CodeOcean
 from codeocean.data_asset import DataAssetSearchOrigin, DataAssetSearchParams
-from mypy_boto3_s3 import S3Client
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry

 from aind_data_asset_indexer.models import CodeOceanIndexBucketJobSettings
 from aind_data_asset_indexer.utils import (
-    build_metadata_record_from_prefix,
     get_all_processed_codeocean_asset_records,
 )

@@ -264,48 +260,31 @@ def _process_codeocean_record(
         self,
         codeocean_record: dict,
         docdb_client: MetadataDbClient,
-        s3_client: S3Client,
     ):
         """
         Processes a code ocean record. It's assumed that the check to verify
         the record is not in DocDB is done upstream.
-        1) Using the s3 location in the codeocean record, build metadata file.
-        2) Save metadata record to DocDB if no issue
+        1) Uses the codeocean record info to register the asset to DocDB.

         Parameters
         ----------
         codeocean_record : dict
         docdb_client : MetadataDbClient
-        s3_client : S3Client

         """
-        location = codeocean_record["location"]
-        created = codeocean_record["created"]
-        external_links = codeocean_record["external_links"]
         name = codeocean_record["name"]
-        url_parts = get_s3_bucket_and_prefix(location)
-        bucket = url_parts["bucket"]
-        s3_prefix = url_parts["prefix"]
-        new_metadata_contents = build_metadata_record_from_prefix(
-            bucket=bucket,
-            prefix=s3_prefix,
-            s3_client=s3_client,
-            optional_name=name,
-            optional_created=created,
-            optional_external_links=external_links,
+        location = codeocean_record["location"]
+        co_asset_id = codeocean_record["co_asset_id"]
+        co_computation_id = codeocean_record["co_computation_id"]
+
+        logging.info(f"Uploading metadata record for: {location}")
+        register_response = docdb_client.register_co_result(
+            s3_location=location,
+            name=name,
+            co_asset_id=co_asset_id,
+            co_computation_id=co_computation_id,
         )
-        if new_metadata_contents is not None:
-            logging.info(f"Uploading metadata record for: {location}")
-            # noinspection PyTypeChecker
-            json_contents = json.loads(new_metadata_contents)
-            response = docdb_client.insert_one_docdb_record(
-                record=json_contents
-            )
-            logging.debug(response.json())
-        else:
-            logging.warning(
-                f"Unable to build metadata record for: {location}!"
-            )
+        logging.info(register_response)

     def _dask_task_to_process_record_list(self, record_list: List[dict]):
         """
@@ -318,15 +297,13 @@ def _dask_task_to_process_record_list(self, record_list: List[dict]):
         record_list : List[dict]

         """
-        # create clients here since dask doesn't serialize them
-        s3_client = boto3.client("s3")
+        # create client here since dask doesn't serialize them
        with self._create_docdb_client() as doc_db_client:
             for record in record_list:
                 try:
                     self._process_codeocean_record(
                         codeocean_record=record,
                         docdb_client=doc_db_client,
-                        s3_client=s3_client,
                     )
                 except requests.HTTPError as e:
                     logging.error(
@@ -337,7 +314,6 @@ def _dask_task_to_process_record_list(self, record_list: List[dict]):
                     logging.error(
                         f'Error processing {record.get("location")}: {repr(e)}'
                     )
-        s3_client.close()

     def _process_codeocean_records(self, records: List[dict]):
         """

src/aind_data_asset_indexer/utils.py

Lines changed: 25 additions & 16 deletions
@@ -4,7 +4,7 @@
 import json
 import logging
 import re
-from datetime import datetime, timezone
+from datetime import datetime
 from json.decoder import JSONDecodeError
 from typing import Dict, Iterator, List, Optional
 from urllib.parse import urlparse
@@ -13,7 +13,6 @@
 from aind_data_schema.core.data_description import DataLevel, DataRegex
 from aind_data_schema.core.metadata import CORE_FILES as CORE_SCHEMAS
 from aind_data_schema.core.metadata import (
-    ExternalPlatforms,
     Metadata,
     create_metadata_json,
 )
@@ -798,12 +797,13 @@ def get_all_processed_codeocean_asset_records(
     Returns
     -------
     Dict[str, dict]
-        {data_asset_location:
-            {"name": data_asset_name,
-             "location": data_asset_location,
-             "created": data_asset_created,
-             "external_links": {"Code Ocean": [data_asset_id]}
-            }
+        {
+            data_asset_location: {
+                "name": data_asset_name,
+                "location": data_asset_location,
+                "co_asset_id": data_asset_id,
+                "co_computation_id": data_asset_computation_id,
+            }
         }

     """
@@ -826,24 +826,33 @@
     for data_asset_info in iter_response:
         data_asset_id = data_asset_info.id
         data_asset_name = data_asset_info.name
-        created_timestamp = data_asset_info.created
-        created_datetime = datetime.fromtimestamp(
-            created_timestamp, tz=timezone.utc
-        )
+        # Get source computation id from provenance
+        if (
+            data_asset_info.provenance is not None
+            and data_asset_info.provenance.computation is not None
+        ):
+            data_asset_computation_id = (
+                data_asset_info.provenance.computation
+            )
+        else:
+            data_asset_computation_id = None
+            logging.warning(
+                f"Data asset {data_asset_id}, {data_asset_name} does not"
+                "have computation provenance!"
+            )
         # Results hosted externally have a source_bucket field
         is_external = data_asset_info.source_bucket is not None
         if (
             not is_external
             and data_asset_info.state == DataAssetState.Ready
+            and data_asset_computation_id is not None
         ):
             location = f"s3://{co_data_asset_bucket}/{data_asset_id}"
             extracted_info[location] = {
                 "name": data_asset_name,
                 "location": location,
-                "created": created_datetime,
-                "external_links": {
-                    ExternalPlatforms.CODEOCEAN.value: [data_asset_id]
-                },
+                "co_asset_id": data_asset_id,
+                "co_computation_id": data_asset_computation_id,
             }
         # Occasionally, there are duplicate items returned. This is one
         # way to remove the duplicates.
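
With this change, get_all_processed_codeocean_asset_records only keeps internal, ready assets whose provenance carries a computation id, and each kept record now exposes co_asset_id and co_computation_id instead of a created timestamp and external links. A small sketch of the same filtering applied to a single data-asset object; SimpleNamespace stands in for the Code Ocean SDK model, and the bucket name is a placeholder:

# Sketch of the new provenance-based filtering, applied to one asset.
# SimpleNamespace is a stand-in for the codeocean SDK's data asset model;
# only the attributes read in the diff above (.id, .name, .provenance) appear.
import logging
from types import SimpleNamespace
from typing import Optional


def extract_record(data_asset_info, co_data_asset_bucket: str) -> Optional[dict]:
    """Return an indexer-style record, or None if there is no computation id."""
    provenance = data_asset_info.provenance
    computation_id = provenance.computation if provenance is not None else None
    if computation_id is None:
        logging.warning(
            f"Data asset {data_asset_info.id}, {data_asset_info.name} "
            "does not have computation provenance!"
        )
        return None
    location = f"s3://{co_data_asset_bucket}/{data_asset_info.id}"
    return {
        "name": data_asset_info.name,
        "location": location,
        "co_asset_id": data_asset_info.id,
        "co_computation_id": computation_id,
    }


# An asset without provenance (like the new fixture entry below) is skipped:
asset = SimpleNamespace(
    id="888888ee-88ee-8e88-888e-8e88e8888888",
    name="ecephys_734567_2024-07-03_09-15-30_sorted_2024-07-04_08-00-00",
    provenance=None,
)
print(extract_record(asset, "example-co-bucket"))  # -> None, with a warning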

tests/resources/utils/example_search_co_assets.json

Lines changed: 57 additions & 0 deletions
@@ -16,6 +16,7 @@
       "provenance": {
         "capsule": "8f8f888f-8888-88f8-fff8-888888ff8f8f",
         "commit": "3333333333333f33be33ae33fb33333333e33333",
+        "computation": "77a777aa-a77a-7a77-a7aa-77777a777aaa",
         "data_assets": [
           "aaa3a3aa-a33a-3aa3-a33a-aa3aaa333a3a"
         ],
@@ -47,6 +48,7 @@
       "provenance": {
         "capsule": "8f8f888f-8888-88f8-fff8-888888ff8f8f",
         "commit": "3333333333333f33be33ae33fb33333333e33333",
+        "computation": "2ff2222f-ff22-2f22-2222-ff22fff22f22",
         "data_assets": [
           "d11dd111-111d-1d11-d111-d1d11111dd11"
         ],
@@ -61,6 +63,61 @@
         "ecephys"
       ],
       "type": "result"
+    },
+    {
+      "created": 1720045200,
+      "custom_metadata": {
+        "data level": "derived",
+        "experiment type": "ecephys",
+        "subject id": "734567"
+      },
+      "description": "",
+      "files": 8921,
+      "id": "888888ee-88ee-8e88-888e-8e88e8888888",
+      "last_used": 1720045470,
+      "mount": "ecephys_734567_2024-07-03_09-15-30_sorted_2024-07-04_08-00-00",
+      "name": "ecephys_734567_2024-07-03_09-15-30_sorted_2024-07-04_08-00-00",
+      "provenance": null,
+      "size": 17234567890,
+      "state": "ready",
+      "tags": [
+        "derived",
+        "734567",
+        "ecephys"
+      ],
+      "type": "result"
+    },
+    {
+      "created": 1719435600,
+      "custom_metadata": {
+        "data level": "derived",
+        "experiment type": "ecephys",
+        "subject id": "723456"
+      },
+      "description": "",
+      "files": 7543,
+      "id": "999999dd-99dd-9d99-999d-9d99d9999999",
+      "last_used": 1719435870,
+      "mount": "ecephys_723456_2024-06-26_14-30-15_sorted_2024-06-27_10-00-00",
+      "name": "ecephys_723456_2024-06-26_14-30-15_sorted_2024-06-27_10-00-00",
+      "provenance": {
+        "capsule": "8f8f888f-8888-88f8-fff8-888888ff8f8f",
+        "commit": "3333333333333f33be33ae33fb33333333e33333",
+        "data_assets": [
+          "e22ee222-222e-2e22-e222-e2e22222ee22"
+        ],
+        "computation": null,
+        "docker_image": "",
+        "run_script": "code/run"
+      },
+      "size": 15847923456,
+      "state": "ready",
+      "tags": [
+        "derived",
+        "723456",
+        "ecephys"
+      ],
+      "type": "result"
     }
   ]
 }
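
The fixture now exercises three provenance shapes: results carrying a computation id, a result with "provenance": null, and a result whose provenance has "computation": null. Only the first kind survives the new filtering in utils.py. A quick sketch for inspecting the fixture; it assumes the file's top-level key is "results", as in a Code Ocean search response:

# Sketch: count how many fixture results carry a computation id and would
# therefore be indexed. Assumes the top-level key is "results".
import json
from pathlib import Path

fixture = Path("tests/resources/utils/example_search_co_assets.json")
results = json.loads(fixture.read_text())["results"]

indexable = [
    r for r in results
    if r.get("provenance") and r["provenance"].get("computation")
]
print(f"{len(indexable)} of {len(results)} results have a computation id")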
