Skip to content

Commit 12e71b8

Browse files
authored
feat: update record with actual s3 location if mismatch (#146)
* feat: update record with actual s3 location if mismatch
* test: update unit tests
* feat: allow name mismatch with warning
* fix: outdated warning message
1 parent d144dcf commit 12e71b8

File tree

4 files changed

+96
-61
lines changed

4 files changed

+96
-61
lines changed

src/aind_data_asset_indexer/aind_bucket_indexer.py

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import warnings
99
from datetime import datetime, timedelta, timezone
1010
from typing import Dict, List, Optional
11+
from uuid import uuid4
1112

1213
import boto3
1314
import dask.bag as dask_bag
@@ -388,9 +389,8 @@ def _process_docdb_record(
388389
docdb_record, self.job_settings.s3_bucket
389390
):
390391
logging.warning(
391-
f"Record location {docdb_record.get('location')} or name "
392-
f"{docdb_record.get('name')} not valid for bucket "
393-
f"{self.job_settings.s3_bucket}!"
392+
f"Record location {docdb_record.get('location')} not valid "
393+
f"for bucket {self.job_settings.s3_bucket}! Skipping."
394394
)
395395
else:
396396
s3_parts = get_s3_bucket_and_prefix(docdb_record["location"])
@@ -587,10 +587,9 @@ def _process_prefix(
587587
key=object_key,
588588
)
589589
if does_metadata_file_exist:
590-
# If record not in DocDb, then copy it to DocDb if the location
591-
# in the metadata record matches the location the record lives in
592-
# Otherwise, log a warning that the metadata record location does
593-
# not make sense.
590+
# If record not in DocDb, then copy it to DocDb.
591+
# If location in the record does not match the actual s3 location,
592+
# first log a warning and correct the record.
594593
s3_full_location = get_s3_location(bucket, object_key)
595594
if record_id is None:
596595
json_contents = download_json_file_from_s3(
@@ -600,40 +599,44 @@ def _process_prefix(
600599
)
601600
if json_contents:
602601
# noinspection PyTypeChecker
603-
if is_record_location_valid(
602+
if not is_record_location_valid(
604603
json_contents,
605604
expected_bucket=bucket,
606605
expected_prefix=s3_prefix,
607606
):
608-
if "_id" in json_contents:
609-
logging.info(
610-
f"Adding record to docdb for: {location}"
611-
)
612-
response = docdb_client.insert_one_docdb_record(
613-
record=json_contents
614-
)
615-
logging.debug(response.json())
616-
cond_copy_then_sync_core_json_files(
617-
metadata_json=json.dumps(
618-
json_contents, default=str
619-
),
620-
bucket=bucket,
621-
prefix=s3_prefix,
622-
s3_client=s3_client,
623-
copy_original_md_subdir=(
624-
self.job_settings.copy_original_md_subdir
625-
),
626-
)
627-
else:
628-
logging.warning(
629-
f"Metadata record for {location} "
630-
f"does not have an _id field!"
631-
)
632-
else:
633607
logging.warning(
634608
f"Location field {json_contents.get('location')} "
635-
f"or name field {json_contents.get('name')} does "
636-
f"not match actual location of record {location}!"
609+
"does not match actual location of record "
610+
f"{location}! Updating record with correct "
611+
"location and new id."
612+
)
613+
json_contents.update(
614+
{
615+
"_id": str(uuid4()),
616+
"location": location,
617+
}
618+
)
619+
if "_id" in json_contents:
620+
logging.info(f"Adding record to docdb for: {location}")
621+
response = docdb_client.insert_one_docdb_record(
622+
record=json_contents
623+
)
624+
logging.debug(response.json())
625+
cond_copy_then_sync_core_json_files(
626+
metadata_json=json.dumps(
627+
json_contents, default=str
628+
),
629+
bucket=bucket,
630+
prefix=s3_prefix,
631+
s3_client=s3_client,
632+
copy_original_md_subdir=(
633+
self.job_settings.copy_original_md_subdir
634+
),
635+
)
636+
else:
637+
logging.warning(
638+
f"Metadata record for {location} "
639+
f"does not have an _id field!"
637640
)
638641
else:
639642
logging.warning(

src/aind_data_asset_indexer/utils.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,9 @@ def is_record_location_valid(
122122
like 's3://{expected_bucket}/prefix'
123123
Will return False if there is no s3 scheme, the bucket does not match
124124
the expected bucket, the prefix contains forward slashes, or the prefix
125-
is invalid, or doesn't match the record name or expected prefix.
125+
is invalid or does not match the expected prefix.
126+
If the record name does not match the prefix, a warning is logged,
127+
but the method will still return True.
126128
127129
"""
128130
expected_stripped_prefix = (
@@ -142,15 +144,18 @@ def is_record_location_valid(
142144
stripped_prefix == ""
143145
or len(stripped_prefix.split("/")) > 1
144146
or not is_prefix_valid(stripped_prefix)
145-
or record.get("name") != stripped_prefix
146147
or (
147148
expected_prefix is not None
148149
and stripped_prefix != expected_stripped_prefix
149150
)
150151
):
151152
return False
152-
else:
153-
return True
153+
elif record.get("name") != stripped_prefix:
154+
logging.warning(
155+
f"Record name {record.get('name')} does not match "
156+
f"prefix {stripped_prefix}."
157+
)
158+
return True
154159

155160

156161
def compute_md5_hash(json_contents: str) -> str:

tests/test_aind_bucket_indexer.py

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -759,9 +759,8 @@ def test_process_docdb_record_invalid_location(
759759
)
760760
expected_log_messages = [
761761
"WARNING:root:Record location "
762-
"no_s3/bucket/prefix1_2024-01-01_01-01-01 or name "
763-
"prefix1_2024-01-01_01-01-01 not valid for bucket "
764-
"aind-ephys-data-dev-u5u0i5!"
762+
"no_s3/bucket/prefix1_2024-01-01_01-01-01 not valid "
763+
"for bucket aind-ephys-data-dev-u5u0i5! Skipping."
765764
]
766765
self.assertEqual(expected_log_messages, captured.output)
767766
self.assertIsNone(docdb_id_to_delete)
@@ -787,8 +786,8 @@ def test_process_docdb_record_invalid_prefix(
787786
},
788787
)
789788
expected_log_messages = [
790-
"WARNING:root:Record location s3://bucket/prefix1 or name prefix1 "
791-
"not valid for bucket aind-ephys-data-dev-u5u0i5!"
789+
"WARNING:root:Record location s3://bucket/prefix1 "
790+
"not valid for bucket aind-ephys-data-dev-u5u0i5! Skipping."
792791
]
793792
self.assertEqual(expected_log_messages, captured.output)
794793
self.assertIsNone(docdb_id_to_delete)
@@ -1444,6 +1443,7 @@ def test_process_prefix_no_record_yes_file_good_file_no__id(
14441443
mock_cond_copy_then_sync_core_json_files.assert_not_called()
14451444
mock_upload_metadata_json_str_to_s3.assert_not_called()
14461445

1446+
@patch("aind_data_asset_indexer.aind_bucket_indexer.uuid4")
14471447
@patch(
14481448
"aind_data_asset_indexer.aind_bucket_indexer."
14491449
"upload_metadata_json_str_to_s3"
@@ -1467,6 +1467,7 @@ def test_process_prefix_no_record_yes_file_good_file_bad_location(
14671467
mock_download_json_file_from_s3: MagicMock,
14681468
mock_cond_copy_then_sync_core_json_files: MagicMock,
14691469
mock_upload_metadata_json_str_to_s3: MagicMock,
1470+
mock_uuid: MagicMock,
14701471
):
14711472
"""Tests _process_prefix method when there is no record in DocDb,
14721473
there is and there is metadata.nd.json file in S3, and the file can
@@ -1475,12 +1476,15 @@ def test_process_prefix_no_record_yes_file_good_file_bad_location(
14751476
mock_does_s3_object_exist.return_value = True
14761477
# Test what happens when the location in the record does not match the
14771478
# expected location
1478-
mocked_downloaded_record = deepcopy(self.example_md_record)
1479-
mocked_downloaded_record["location"] = (
1480-
f"s3://{self.basic_job.job_settings.s3_bucket}/"
1481-
f"ecephys_642478_2020-01-10_10-10-10"
1479+
mock_download_json_file_from_s3.return_value = {
1480+
"_id": "5ca4a951-d374-4f4b-8279-d570a35b2286",
1481+
"name": "ecephys_642478_2023-01-17_13-56-29",
1482+
"location": "s3://bucket/ecephys_642478_2020-01-10_10-10-10",
1483+
}
1484+
mock_uuid.return_value = "a62344ff-1cec-48f4-914e-7482797e6332"
1485+
mock_docdb_client.insert_one_docdb_record.return_value = MagicMock(
1486+
status_code=200, json=MagicMock(return_value="inserted")
14821487
)
1483-
mock_download_json_file_from_s3.return_value = mocked_downloaded_record
14841488

14851489
location_to_id_map = dict()
14861490
with self.assertLogs(level="DEBUG") as captured:
@@ -1490,16 +1494,34 @@ def test_process_prefix_no_record_yes_file_good_file_bad_location(
14901494
s3_client=mock_s3_client,
14911495
location_to_id_map=location_to_id_map,
14921496
)
1497+
actual_location = (
1498+
f"s3://{self.basic_job.job_settings.s3_bucket}/"
1499+
"ecephys_642478_2023-01-17_13-56-29"
1500+
)
14931501
expected_log_messages = [
1494-
"WARNING:root:Location field s3://aind-ephys-data-dev-u5u0i5/"
1495-
"ecephys_642478_2020-01-10_10-10-10 or name field "
1496-
"ecephys_642478_2023-01-17_13-56-29 does not match actual "
1497-
"location of record s3://aind-ephys-data-dev-u5u0i5/"
1498-
"ecephys_642478_2023-01-17_13-56-29!"
1502+
"WARNING:root:Location field s3://bucket/"
1503+
"ecephys_642478_2020-01-10_10-10-10 does not match actual "
1504+
f"location of record {actual_location}! Updating "
1505+
"record with correct location and new id.",
1506+
f"INFO:root:Adding record to docdb for: {actual_location}",
1507+
"DEBUG:root:inserted",
14991508
]
1509+
expected_record = {
1510+
"_id": mock_uuid.return_value,
1511+
"name": "ecephys_642478_2023-01-17_13-56-29",
1512+
"location": actual_location,
1513+
}
15001514
self.assertEqual(expected_log_messages, captured.output)
1501-
mock_docdb_client.insert_one_docdb_record.assert_not_called()
1502-
mock_cond_copy_then_sync_core_json_files.assert_not_called()
1515+
mock_docdb_client.insert_one_docdb_record.assert_called_once_with(
1516+
record=expected_record
1517+
)
1518+
mock_cond_copy_then_sync_core_json_files.assert_called_once_with(
1519+
metadata_json=json.dumps(expected_record),
1520+
bucket=self.basic_job.job_settings.s3_bucket,
1521+
prefix="ecephys_642478_2023-01-17_13-56-29",
1522+
s3_client=mock_s3_client,
1523+
copy_original_md_subdir="original_metadata",
1524+
)
15031525
mock_upload_metadata_json_str_to_s3.assert_not_called()
15041526

15051527
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")

tests/test_utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -289,20 +289,25 @@ def test_is_record_location_valid_false4(self):
289289
)
290290
)
291291

292-
def test_is_record_location_valid_false5(self):
293-
"""Tests is_record_location_valid returns false when prefix does
292+
def test_is_record_location_valid_true2(self):
293+
"""Tests is_record_location_valid returns true when prefix does
294294
not match name"""
295295

296296
example_record = {
297297
"_id": "abc-123",
298298
"name": "prefix2_2024-01-01_01-01-01",
299299
"location": "s3://some_bucket/prefix1_2024-01-01_01-01-01",
300300
}
301-
self.assertFalse(
302-
is_record_location_valid(
301+
with self.assertLogs(level="DEBUG") as captured:
302+
result = is_record_location_valid(
303303
example_record, expected_bucket="some_bucket"
304304
)
305-
)
305+
expected_log_messages = [
306+
"WARNING:root:Record name prefix2_2024-01-01_01-01-01 does not "
307+
"match prefix prefix1_2024-01-01_01-01-01."
308+
]
309+
self.assertEqual(expected_log_messages, captured.output)
310+
self.assertTrue(result)
306311

307312
def test_is_record_location_valid_false6(self):
308313
"""Tests is_record_location_valid returns false when prefixes don't

0 commit comments

Comments (0)