99from typing import Dict , Iterator , List , Optional
1010from urllib .parse import urlparse
1111
12- from aind_codeocean_api .codeocean import CodeOceanClient
1312from aind_data_schema .core .data_description import DataLevel , DataRegex
1413from aind_data_schema .core .metadata import CORE_FILES as CORE_SCHEMAS
1514from aind_data_schema .core .metadata import (
1817 create_metadata_json ,
1918)
2019from botocore .exceptions import ClientError
20+ from codeocean import CodeOcean
21+ from codeocean .data_asset import (
22+ DataAssetSearchParams ,
23+ DataAssetState ,
24+ DataAssetType ,
25+ )
2126from mypy_boto3_s3 import S3Client
2227from mypy_boto3_s3 .type_defs import (
2328 PaginatorConfigTypeDef ,
@@ -934,7 +939,7 @@ def build_docdb_location_to_id_map(
934939
935940
936941def get_all_processed_codeocean_asset_records (
937- co_client : CodeOceanClient , co_data_asset_bucket : str
942+ co_client : CodeOcean , co_data_asset_bucket : str
938943) -> Dict [str , dict ]:
939944 """
940945 Gets all the data asset records we're interested in indexing. The location
@@ -943,7 +948,7 @@ def get_all_processed_codeocean_asset_records(
943948
944949 Parameters
945950 ----------
946- co_client : CodeOceanClient
951+ co_client : CodeOcean
947952 co_data_asset_bucket : str
948953 Name of Code Ocean's data asset bucket
949954 Returns
@@ -966,31 +971,27 @@ def get_all_processed_codeocean_asset_records(
966971 all_responses = dict ()
967972
968973 for tag in {DataLevel .DERIVED .value , "processed" }:
969- response = co_client .search_all_data_assets (
970- type = "result" , query = f"tag:{ tag } "
974+ search_params = DataAssetSearchParams (
975+ type = DataAssetType .Result , query = f"tag:{ tag } "
976+ )
977+ iter_response = co_client .data_assets .search_data_assets_iterator (
978+ search_params = search_params
971979 )
972- # There is a bug with the codeocean api that caps the number of
973- # results in a single request to 10000.
974- if len (response .json ()["results" ]) >= 10000 :
975- logging .warning (
976- "Number of records exceeds 10,000! This can lead to "
977- "possible data loss."
978- )
979980 # Extract relevant information
980981 extracted_info = dict ()
981- for data_asset_info in response . json ()[ "results" ] :
982- data_asset_id = data_asset_info [ "id" ]
983- data_asset_name = data_asset_info [ " name" ]
984- created_timestamp = data_asset_info [ " created" ]
982+ for data_asset_info in iter_response :
983+ data_asset_id = data_asset_info . id
984+ data_asset_name = data_asset_info . name
985+ created_timestamp = data_asset_info . created
985986 created_datetime = datetime .fromtimestamp (
986987 created_timestamp , tz = timezone .utc
987988 )
988989 # Results hosted externally have a source_bucket field
989- is_external = (
990- data_asset_info . get ( "sourceBucket" ) is not None
991- or data_asset_info . get ( "source_bucket" ) is not None
992- )
993- if not is_external and data_asset_info . get ( "state" ) == "ready" :
990+ is_external = data_asset_info . source_bucket is not None
991+ if (
992+ not is_external
993+ and data_asset_info . state == DataAssetState . Ready
994+ ) :
994995 location = f"s3://{ co_data_asset_bucket } /{ data_asset_id } "
995996 extracted_info [location ] = {
996997 "name" : data_asset_name ,
0 commit comments