Skip to content

Commit 3aae35e

Browse files
authored
hot-fix: original metadata checks (#83)
* fix: `docdb_record_contents` param typo
* fix: substring matching for list s3 objects
* fix: get file info for new file in root folder
* fix: delete corrupt root file after copy
1 parent 6147a35 commit 3aae35e

File tree

4 files changed

+61
-20
lines changed

4 files changed

+61
-20
lines changed

src/aind_data_asset_indexer/aind_bucket_indexer.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ def _resolve_schema_information(
233233
# with record info if they are different
234234
if is_in_record and is_in_root and is_in_copy_subdir:
235235
self._write_root_file_with_record_info(
236-
docdb_record=docdb_record.get(field_name), **common_kwargs
236+
docdb_record_contents=docdb_record.get(field_name),
237+
**common_kwargs,
237238
)
238239
# If field is not null, a file exists in the root folder, and
239240
# no file exists in copy_subdir, then copy root folder file to
@@ -242,21 +243,35 @@ def _resolve_schema_information(
242243
elif is_in_record and is_in_root and not is_in_copy_subdir:
243244
self._copy_file_from_root_to_subdir(**common_kwargs)
244245
self._write_root_file_with_record_info(
245-
docdb_record=docdb_record.get(field_name), **common_kwargs
246+
docdb_record_contents=docdb_record.get(field_name),
247+
**common_kwargs,
246248
)
247249
# If field is not null, no file exists in the root folder, and
248250
# a file exists in copy_subdir, then create a file in the root
249251
# folder with the record info
250252
elif is_in_record and not is_in_root and is_in_copy_subdir:
251253
self._write_root_file_with_record_info(
252-
docdb_record=docdb_record.get(field_name), **common_kwargs
254+
docdb_record_contents=docdb_record.get(field_name),
255+
**common_kwargs,
253256
)
254257
# If field is not null, no file exists in the root folder, and
255258
# no file exists in copy_subdir, then create a file in the root
256259
# folder with the record info and then copy it to the copy subdir
257260
elif is_in_record and not is_in_root and not is_in_copy_subdir:
258261
self._write_root_file_with_record_info(
259-
docdb_record=docdb_record.get(field_name), **common_kwargs
262+
docdb_record_contents=docdb_record.get(field_name),
263+
**common_kwargs,
264+
)
265+
# Get file info for new file in root folder
266+
object_key = create_object_key(
267+
prefix=prefix, filename=core_schema_file_name
268+
)
269+
common_kwargs["core_schema_info_in_root"] = (
270+
get_dict_of_file_info(
271+
s3_client=s3_client,
272+
bucket=self.job_settings.s3_bucket,
273+
keys=[object_key],
274+
).get(object_key)
260275
)
261276
self._copy_file_from_root_to_subdir(**common_kwargs)
262277
# If field is null, a file exists in the root folder, and
@@ -298,6 +313,11 @@ def _resolve_schema_information(
298313
f"Something went wrong downloading or parsing "
299314
f"s3://{self.job_settings.s3_bucket}/{object_key}"
300315
)
316+
# Can delete corrupt root file since a copy has been made
317+
response = s3_client.delete_object(
318+
Bucket=self.job_settings.s3_bucket, Key=object_key
319+
)
320+
logging.debug(f"{response}")
301321

302322
# If field is null, no file exists in the root folder, and
303323
# a file exists in copy_subdir, then do nothing

src/aind_data_asset_indexer/utils.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -351,11 +351,10 @@ def does_s3_metadata_copy_exist(
351351
Bucket=bucket, Prefix=copy_prefix, Delimiter="/"
352352
)
353353
if "Contents" in response:
354-
core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
355-
pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
354+
core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
355+
pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
356356
for obj in response["Contents"]:
357-
file_name = obj["Key"].lstrip(copy_prefix)
358-
m = re.match(pattern, file_name)
357+
m = re.match(pattern, obj["Key"])
359358
if m is not None and m.group(1) in core_schemas:
360359
return True
361360
return False
@@ -393,11 +392,10 @@ def list_metadata_copies(
393392
)
394393
files = []
395394
if "Contents" in response:
396-
core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
397-
pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
395+
core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
396+
pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
398397
for obj in response["Contents"]:
399-
file_name = obj["Key"].lstrip(copy_prefix)
400-
m = re.match(pattern, file_name)
398+
m = re.match(pattern, obj["Key"])
401399
if m is not None and m.group(1) in core_schemas:
402400
files.append(f"{m.group(1)}.json")
403401
return files

tests/test_aind_bucket_indexer.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def test_resolve_schema_information_case_1(
243243
)
244244
self.assertEqual(dict(), docdb_fields_to_update)
245245
mock_write_file_with_record_info.assert_called_once_with(
246-
docdb_record=self.example_md_record.get("subject"),
246+
docdb_record_contents=self.example_md_record.get("subject"),
247247
s3_client=mock_s3_client,
248248
prefix="ecephys_642478_2023-01-17_13-56-29",
249249
core_schema_file_name="subject.json",
@@ -316,7 +316,7 @@ def test_resolve_schema_information_case_2(
316316
),
317317
)
318318
mock_write_file_with_record_info.assert_called_once_with(
319-
docdb_record=self.example_md_record.get("subject"),
319+
docdb_record_contents=self.example_md_record.get("subject"),
320320
s3_client=mock_s3_client,
321321
prefix="ecephys_642478_2023-01-17_13-56-29",
322322
core_schema_file_name="subject.json",
@@ -373,7 +373,7 @@ def test_resolve_schema_information_case_3(
373373
self.assertEqual(dict(), docdb_fields_to_update)
374374
mock_copy_file_to_subdir.assert_not_called()
375375
mock_write_file_with_record_info.assert_called_once_with(
376-
docdb_record=self.example_md_record.get("subject"),
376+
docdb_record_contents=self.example_md_record.get("subject"),
377377
s3_client=mock_s3_client,
378378
prefix="ecephys_642478_2023-01-17_13-56-29",
379379
core_schema_file_name="subject.json",
@@ -400,12 +400,14 @@ def test_resolve_schema_information_case_3(
400400
"aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob."
401401
"_write_root_file_with_record_info"
402402
)
403+
@patch("aind_data_asset_indexer.aind_bucket_indexer.get_dict_of_file_info")
403404
@patch(
404405
"aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names",
405406
["subject.json"],
406407
) # Mocking this to limit for loop to one iteration
407408
def test_resolve_schema_information_case_4(
408409
self,
410+
mock_get_dict_of_file_info: MagicMock,
409411
mock_write_file_with_record_info: MagicMock,
410412
mock_copy_file_to_subdir: MagicMock,
411413
mock_download_json_file: MagicMock,
@@ -420,6 +422,18 @@ def test_resolve_schema_information_case_4(
420422
"""
421423

422424
core_schema_info_in_root = dict()
425+
core_schema_info_in_root_after_copy = {
426+
"ecephys_642478_2023-01-17_13-56-29/subject.json": {
427+
"last_modified": datetime(
428+
2024, 5, 15, 17, 41, 28, tzinfo=timezone.utc
429+
),
430+
"e_tag": '"7ce612b2f26be2efe806990cb4eb4266"',
431+
"version_id": "version_id",
432+
}
433+
}
434+
mock_get_dict_of_file_info.return_value = (
435+
core_schema_info_in_root_after_copy
436+
)
423437
docdb_fields_to_update = self.basic_job._resolve_schema_information(
424438
prefix="ecephys_642478_2023-01-17_13-56-29",
425439
s3_client=mock_s3_client,
@@ -429,20 +443,25 @@ def test_resolve_schema_information_case_4(
429443
)
430444
self.assertEqual(dict(), docdb_fields_to_update)
431445
mock_write_file_with_record_info.assert_called_once_with(
432-
docdb_record=self.example_md_record.get("subject"),
446+
docdb_record_contents=self.example_md_record.get("subject"),
433447
s3_client=mock_s3_client,
434448
prefix="ecephys_642478_2023-01-17_13-56-29",
435449
core_schema_file_name="subject.json",
436450
core_schema_info_in_root=core_schema_info_in_root.get(
437451
"subject.json"
438452
),
439453
)
454+
mock_get_dict_of_file_info.assert_called_once_with(
455+
s3_client=mock_s3_client,
456+
bucket=self.basic_job.job_settings.s3_bucket,
457+
keys=["ecephys_642478_2023-01-17_13-56-29/subject.json"],
458+
)
440459
mock_copy_file_to_subdir.assert_called_once_with(
441460
s3_client=mock_s3_client,
442461
prefix="ecephys_642478_2023-01-17_13-56-29",
443462
core_schema_file_name="subject.json",
444-
core_schema_info_in_root=core_schema_info_in_root.get(
445-
"subject.json"
463+
core_schema_info_in_root=core_schema_info_in_root_after_copy.get(
464+
"ecephys_642478_2023-01-17_13-56-29/subject.json"
446465
),
447466
)
448467
mock_download_json_file.assert_not_called()
@@ -643,13 +662,17 @@ def test_resolve_schema_information_case_6_corrupt_download(
643662
"subject.json"
644663
),
645664
)
646-
mock_log_debug.assert_not_called()
647665
mock_log_info.assert_not_called()
648666
mock_log_warn.assert_called_once_with(
649667
"Something went wrong downloading or parsing "
650668
"s3://aind-ephys-data-dev-u5u0i5/"
651669
"ecephys_642478_2023-01-17_13-56-29/subject.json"
652670
)
671+
mock_s3_client.delete_object.assert_called_once_with(
672+
Bucket="aind-ephys-data-dev-u5u0i5",
673+
Key="ecephys_642478_2023-01-17_13-56-29/subject.json",
674+
)
675+
mock_log_debug.assert_called_once()
653676

654677
@patch("boto3.client")
655678
@patch("logging.debug")

tests/test_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ def test_list_metadata_copies(self, mock_s3_client: MagicMock):
521521
copy_subdir="original_metadata",
522522
s3_client=mock_s3_client,
523523
)
524-
self.assertEqual(["subject.json"], contents)
524+
self.assertEqual(["data_description.json", "subject.json"], contents)
525525

526526
@patch("boto3.client")
527527
def test_does_s3_metadata_copy_exist_none(self, mock_s3_client: MagicMock):

0 commit comments

Comments (0)