Skip to content

Commit 3e327c9

Browse files
authored
Merge pull request #159 from AllenNeuralDynamics/release-v0.19.0
Release v0.19.0
2 parents 5eed1c6 + 88cb9f9 commit 3e327c9

File tree

4 files changed

+204
-8
lines changed

4 files changed

+204
-8
lines changed

docs/source/UserGuide.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,10 @@ The workflow is generally as follows:
6262

6363
- If the metadata record exists in S3 but not in DocDB, copy it
6464
to DocDB.
65-
- If the metadata record does not exist in S3, create it and save
66-
it to S3. Assume a Lambda function will move it over to DocDB.
65+
- If the metadata record for a derived asset does not exist in S3,
66+
create it and save it to S3. Assume a Lambda function will move it
67+
over to DocDB. Metadata records for raw assets are created during
68+
the upload process, **not** by this job.
6769
- In both cases above, ensure the original metadata folder and core
6870
files are in sync with the metadata.nd.json file.
6971

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Package"""
22

3-
__version__ = "0.18.1"
3+
__version__ = "0.19.0"

src/aind_data_asset_indexer/aind_bucket_indexer.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
get_s3_location,
2020
paginate_docdb,
2121
)
22+
from aind_data_schema_models.data_name_patterns import DataLevel
2223
from mypy_boto3_s3 import S3Client
2324
from mypy_boto3_s3.type_defs import CopySourceTypeDef
2425
from requests.adapters import HTTPAdapter
@@ -67,8 +68,8 @@ class AindIndexBucketJob:
6768
4.0) If a metadata record exists, check if it is in DocDB.
6869
4.1) If already in DocDb, then don't do anything.
6970
Otherwise, copy record to DocDB.
70-
4.2) If a metadata record does not exist, then build one and save it to S3.
71-
Assume a lambda function will move it over to DocDb.
71+
4.2) If a metadata record does not exist and the asset is derived, then
72+
build one and save it to S3. Assume a lambda function will move it to DocDB.
7273
4.3) In both cases above, ensure the original metadata folder and core
7374
files are in sync with the metadata.nd.json file.
7475
"""
@@ -535,6 +536,38 @@ def _process_records(self, records: List[dict]):
535536
)
536537
mapped_partitions.compute()
537538

539+
def _get_data_level_for_prefix(
540+
self, s3_client: S3Client, bucket: str, prefix: str
541+
) -> Optional[str]:
542+
"""
543+
Get an asset's data level from the data_description.json file.
544+
545+
Parameters
546+
----------
547+
s3_client : S3Client
548+
bucket : str
549+
prefix : str
550+
551+
Returns
552+
-------
553+
Optional[str]
554+
The data level of the asset. Returns None if data_description.json
555+
file is not found.
556+
"""
557+
data_desc_key = create_object_key(
558+
prefix=prefix, filename="data_description.json"
559+
)
560+
if does_s3_object_exist(
561+
s3_client=s3_client, bucket=bucket, key=data_desc_key
562+
):
563+
json_contents = download_json_file_from_s3(
564+
s3_client=s3_client,
565+
bucket=bucket,
566+
object_key=data_desc_key,
567+
)
568+
return json_contents.get("data_level") if json_contents else None
569+
return None
570+
538571
def _process_prefix(
539572
self,
540573
s3_prefix: str,
@@ -548,8 +581,9 @@ def _process_prefix(
548581
2) If record is in S3 but not DocDb, then copy it to DocDb if the
549582
location in the metadata record matches the actual location and
550583
the record has an _id field. Otherwise, log a warning.
551-
3) If record does not exist in both DocDB and S3, build a new metadata
552-
file and save it to S3 (assume Lambda function will save to DocDB).
584+
3) If record does not exist in both DocDB and S3, check the data level.
585+
For derived assets, build a new metadata file and save it to S3 (assume
586+
Lambda function will save to DocDB).
553587
4) In both cases above, we also copy the original core json files to a
554588
subfolder and ensure the top level core jsons are in sync with the
555589
metadata.nd.json in S3.
@@ -640,7 +674,12 @@ def _process_prefix(
640674
f"Metadata record for {s3_full_location} "
641675
f"already exists in DocDb. Skipping."
642676
)
643-
else: # metadata.nd.json file does not exist in S3. Create a new one.
677+
elif (
678+
self._get_data_level_for_prefix(
679+
s3_client=s3_client, bucket=bucket, prefix=s3_prefix
680+
)
681+
== DataLevel.DERIVED.value
682+
):
644683
# Build a new metadata file, save it to S3 and save it to DocDb.
645684
# Also copy the original core json files to a subfolder and then
646685
# overwrite them with the new fields from metadata.nd.json.
@@ -675,6 +714,11 @@ def _process_prefix(
675714
logging.warning(
676715
f"Unable to build metadata record for: {location}!"
677716
)
717+
else:
718+
logging.info(
719+
f"Metadata record for {location} not found in S3 and data "
720+
"level is not derived. Skipping."
721+
)
678722

679723
def _dask_task_to_process_prefix_list(self, prefix_list: List[str]):
680724
"""

tests/test_aind_bucket_indexer.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,6 +1144,144 @@ def test_process_records(self, mock_dask_bag_map_parts: MagicMock):
11441144
self.basic_job._process_records(example_records)
11451145
mock_dask_bag_map_parts.assert_called()
11461146

1147+
@patch(
1148+
"aind_data_asset_indexer.aind_bucket_indexer."
1149+
"download_json_file_from_s3"
1150+
)
1151+
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
1152+
def test_get_data_level_for_prefix(
1153+
self,
1154+
mock_does_s3_object_exist: MagicMock,
1155+
mock_download_json_file_from_s3: MagicMock,
1156+
):
1157+
"""Tests _get_data_level_for_prefix method."""
1158+
mock_s3_client = MagicMock()
1159+
mock_does_s3_object_exist.return_value = True
1160+
mock_download_json_file_from_s3.return_value = self.example_md_record1[
1161+
"data_description"
1162+
]
1163+
1164+
prefix = "ecephys_642478_2023-01-17_13-56-29"
1165+
data_level = self.basic_job._get_data_level_for_prefix(
1166+
s3_client=mock_s3_client,
1167+
bucket=self.basic_job.job_settings.s3_bucket,
1168+
prefix=prefix,
1169+
)
1170+
self.assertEqual("raw", data_level)
1171+
mock_does_s3_object_exist.assert_called_once_with(
1172+
s3_client=mock_s3_client,
1173+
bucket=self.basic_job.job_settings.s3_bucket,
1174+
key=f"{prefix}/data_description.json",
1175+
)
1176+
mock_download_json_file_from_s3.assert_called_once_with(
1177+
s3_client=mock_s3_client,
1178+
bucket=self.basic_job.job_settings.s3_bucket,
1179+
object_key=f"{prefix}/data_description.json",
1180+
)
1181+
1182+
@patch(
1183+
"aind_data_asset_indexer.aind_bucket_indexer."
1184+
"download_json_file_from_s3"
1185+
)
1186+
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
1187+
def test_get_data_level_for_prefix_no_file(
1188+
self,
1189+
mock_does_s3_object_exist: MagicMock,
1190+
mock_download_json_file_from_s3: MagicMock,
1191+
):
1192+
"""Tests _get_data_level_for_prefix method when there is no
1193+
data_description file."""
1194+
mock_s3_client = MagicMock()
1195+
mock_does_s3_object_exist.return_value = False
1196+
1197+
prefix = "ecephys_642478_2023-01-17_13-56-29"
1198+
data_level = self.basic_job._get_data_level_for_prefix(
1199+
s3_client=mock_s3_client,
1200+
bucket=self.basic_job.job_settings.s3_bucket,
1201+
prefix=prefix,
1202+
)
1203+
self.assertEqual(None, data_level)
1204+
mock_download_json_file_from_s3.assert_not_called()
1205+
1206+
@patch(
1207+
"aind_data_asset_indexer.aind_bucket_indexer."
1208+
"download_json_file_from_s3"
1209+
)
1210+
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
1211+
def test_get_data_level_for_prefix_invalid_file(
1212+
self,
1213+
mock_does_s3_object_exist: MagicMock,
1214+
mock_download_json_file_from_s3: MagicMock,
1215+
):
1216+
"""Tests _get_data_level_for_prefix method when the data_description
1217+
file is not valid JSON."""
1218+
mock_s3_client = MagicMock()
1219+
mock_does_s3_object_exist.return_value = True
1220+
mock_download_json_file_from_s3.return_value = None
1221+
1222+
prefix = "ecephys_642478_2023-01-17_13-56-29"
1223+
data_level = self.basic_job._get_data_level_for_prefix(
1224+
s3_client=mock_s3_client,
1225+
bucket=self.basic_job.job_settings.s3_bucket,
1226+
prefix=prefix,
1227+
)
1228+
self.assertEqual(None, data_level)
1229+
1230+
@patch(
1231+
"aind_data_asset_indexer.aind_bucket_indexer."
1232+
"upload_metadata_json_str_to_s3"
1233+
)
1234+
@patch(
1235+
"aind_data_asset_indexer.aind_bucket_indexer."
1236+
"cond_copy_then_sync_core_json_files"
1237+
)
1238+
@patch(
1239+
"aind_data_asset_indexer.aind_bucket_indexer."
1240+
"build_metadata_record_from_prefix"
1241+
)
1242+
@patch(
1243+
"aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob"
1244+
"._get_data_level_for_prefix"
1245+
)
1246+
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
1247+
@patch("aind_data_asset_indexer.aind_bucket_indexer.MetadataDbClient")
1248+
@patch("boto3.client")
1249+
def test_process_prefix_no_record_no_file_derived_no(
1250+
self,
1251+
mock_s3_client: MagicMock,
1252+
mock_docdb_client: MagicMock,
1253+
mock_does_s3_object_exist: MagicMock,
1254+
mock_get_data_level_for_prefix: MagicMock,
1255+
mock_build_metadata_record_from_prefix: MagicMock,
1256+
mock_cond_copy_then_sync_core_json_files: MagicMock,
1257+
mock_upload_metadata_json_str_to_s3: MagicMock,
1258+
):
1259+
"""Tests _process_prefix method when there is no record in DocDb,
1260+
there is no metadata.nd.json file in S3, and the
1261+
asset data level is not derived."""
1262+
1263+
mock_does_s3_object_exist.return_value = False
1264+
mock_get_data_level_for_prefix.return_value = "raw"
1265+
mock_build_metadata_record_from_prefix.return_value = None
1266+
1267+
location_to_id_map = dict()
1268+
with self.assertLogs(level="DEBUG") as captured:
1269+
self.basic_job._process_prefix(
1270+
s3_prefix="ecephys_642478_2023-01-17_13-56-29",
1271+
docdb_client=mock_docdb_client,
1272+
s3_client=mock_s3_client,
1273+
location_to_id_map=location_to_id_map,
1274+
)
1275+
expected_log_messages = [
1276+
"INFO:root:Metadata record for "
1277+
"s3://aind-ephys-data-dev-u5u0i5/"
1278+
"ecephys_642478_2023-01-17_13-56-29 not found in S3 and data "
1279+
"level is not derived. Skipping."
1280+
]
1281+
self.assertEqual(expected_log_messages, captured.output)
1282+
mock_cond_copy_then_sync_core_json_files.assert_not_called()
1283+
mock_upload_metadata_json_str_to_s3.assert_not_called()
1284+
11471285
@patch(
11481286
"aind_data_asset_indexer.aind_bucket_indexer."
11491287
"upload_metadata_json_str_to_s3"
@@ -1156,6 +1294,10 @@ def test_process_records(self, mock_dask_bag_map_parts: MagicMock):
11561294
"aind_data_asset_indexer.aind_bucket_indexer."
11571295
"build_metadata_record_from_prefix"
11581296
)
1297+
@patch(
1298+
"aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob."
1299+
"_get_data_level_for_prefix"
1300+
)
11591301
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
11601302
@patch("aind_data_asset_indexer.aind_bucket_indexer.MetadataDbClient")
11611303
@patch("boto3.client")
@@ -1164,6 +1306,7 @@ def test_process_prefix_no_record_no_file_build_no(
11641306
mock_s3_client: MagicMock,
11651307
mock_docdb_client: MagicMock,
11661308
mock_does_s3_object_exist: MagicMock,
1309+
mock_get_data_level_for_prefix: MagicMock,
11671310
mock_build_metadata_record_from_prefix: MagicMock,
11681311
mock_cond_copy_then_sync_core_json_files: MagicMock,
11691312
mock_upload_metadata_json_str_to_s3: MagicMock,
@@ -1173,6 +1316,7 @@ def test_process_prefix_no_record_no_file_build_no(
11731316
build_metadata_record_from_prefix returns a None."""
11741317

11751318
mock_does_s3_object_exist.return_value = False
1319+
mock_get_data_level_for_prefix.return_value = "derived"
11761320
mock_build_metadata_record_from_prefix.return_value = None
11771321

11781322
location_to_id_map = dict()
@@ -1204,6 +1348,10 @@ def test_process_prefix_no_record_no_file_build_no(
12041348
"aind_data_asset_indexer.aind_bucket_indexer."
12051349
"build_metadata_record_from_prefix"
12061350
)
1351+
@patch(
1352+
"aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob."
1353+
"_get_data_level_for_prefix"
1354+
)
12071355
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
12081356
@patch("aind_data_asset_indexer.aind_bucket_indexer.MetadataDbClient")
12091357
@patch("boto3.client")
@@ -1212,6 +1360,7 @@ def test_process_prefix_no_record_no_file_build_yes(
12121360
mock_s3_client: MagicMock,
12131361
mock_docdb_client: MagicMock,
12141362
mock_does_s3_object_exist: MagicMock,
1363+
mock_get_data_level_for_prefix: MagicMock,
12151364
mock_build_metadata_record_from_prefix: MagicMock,
12161365
mock_cond_copy_then_sync_core_json_files: MagicMock,
12171366
mock_upload_metadata_json_str_to_s3: MagicMock,
@@ -1222,6 +1371,7 @@ def test_process_prefix_no_record_no_file_build_yes(
12221371

12231372
expected_prefix = "ecephys_642478_2023-01-17_13-56-29"
12241373
mock_does_s3_object_exist.return_value = False
1374+
mock_get_data_level_for_prefix.return_value = "derived"
12251375
mock_build_metadata_record_from_prefix.return_value = json.dumps(
12261376
self.example_md_record
12271377
)

0 commit comments

Comments
 (0)