Skip to content

Commit 40f7573

Browse files
committed
fix: Reader caching
1 parent 02979d1 commit 40f7573

File tree

3 files changed

+367
-30
lines changed

3 files changed

+367
-30
lines changed

src/c2pa/c2pa.py

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,6 +1358,10 @@ def __init__(self,
13581358
# we may have opened ourselves, and that we need to close later
13591359
self._backing_file = None
13601360

1361+
# Cache for manifest JSON string and parsed data to avoid multiple calls
1362+
self._manifest_json_str_cache = None
1363+
self._manifest_data_cache = None
1364+
13611365
if stream is None:
13621366
# If we don't get a stream as param:
13631367
# Create a stream from the file path in format_or_path
@@ -1620,6 +1624,9 @@ def close(self):
16201624
Reader._ERROR_MESSAGES['cleanup_error'].format(
16211625
str(e)))
16221626
finally:
1627+
# Clear the cache when closing
1628+
self._manifest_json_str_cache = None
1629+
self._manifest_data_cache = None
16231630
self._closed = True
16241631

16251632
def json(self) -> str:
@@ -1634,6 +1641,10 @@ def json(self) -> str:
16341641

16351642
self._ensure_valid_state()
16361643

1644+
# Return cached result if available
1645+
if self._manifest_json_str_cache is not None:
1646+
return self._manifest_json_str_cache
1647+
16371648
result = _lib.c2pa_reader_json(self._reader)
16381649

16391650
if result is None:
@@ -1642,7 +1653,29 @@ def json(self) -> str:
16421653
raise C2paError(error)
16431654
raise C2paError("Error during manifest parsing in Reader")
16441655

1645-
return _convert_to_py_string(result)
1656+
# Cache the result and return it
1657+
self._manifest_json_str_cache = _convert_to_py_string(result)
1658+
return self._manifest_json_str_cache
1659+
1660+
def _get_cached_manifest_data(self) -> dict:
1661+
"""Get the cached manifest data, fetching and parsing if not cached.
1662+
1663+
Returns:
1664+
A dictionary containing the parsed manifest data
1665+
1666+
Raises:
1667+
C2paError: If there was an error getting or parsing the JSON
1668+
"""
1669+
if self._manifest_data_cache is None:
1670+
if self._manifest_json_str_cache is None:
1671+
self._manifest_json_str_cache = self.json()
1672+
1673+
try:
1674+
self._manifest_data_cache = json.loads(self._manifest_json_str_cache)
1675+
except json.JSONDecodeError as e:
1676+
raise C2paError(f"Failed to parse manifest JSON: {str(e)}") from e
1677+
1678+
return self._manifest_data_cache
16461679

16471680
def get_active_manifest(self) -> dict:
16481681
"""Get the active manifest from the manifest store.
@@ -1658,12 +1691,8 @@ def get_active_manifest(self) -> dict:
16581691
C2paError: If there was an error getting the manifest JSON
16591692
KeyError: If the active_manifest key is missing from the JSON
16601693
"""
1661-
# Self-read to get the JSON
1662-
manifest_json_str = self.json()
1663-
try:
1664-
manifest_data = json.loads(manifest_json_str)
1665-
except json.JSONDecodeError as e:
1666-
raise C2paError(f"Failed to parse manifest JSON: {str(e)}") from e
1694+
# Get cached manifest data
1695+
manifest_data = self._get_cached_manifest_data()
16671696

16681697
# Get the active manfiest id/label
16691698
if "active_manifest" not in manifest_data:
@@ -1681,7 +1710,7 @@ def get_active_manifest(self) -> dict:
16811710

16821711
return manifests[active_manifest_id]
16831712

1684-
def get_manifest_by_label(self, label: str) -> dict:
1713+
def get_manifest_from_label(self, label: str) -> dict:
16851714
"""Get a specific manifest from the manifest store by its label.
16861715
16871716
This method retrieves the manifest JSON and extracts the manifest
@@ -1697,12 +1726,8 @@ def get_manifest_by_label(self, label: str) -> dict:
16971726
C2paError: If there was an error getting the manifest JSON
16981727
KeyError: If the manifests key is missing from the JSON
16991728
"""
1700-
# Self-read to get the JSON
1701-
manifest_json_str = self.json()
1702-
try:
1703-
manifest_data = json.loads(manifest_json_str)
1704-
except json.JSONDecodeError as e:
1705-
raise C2paError(f"Failed to parse manifest JSON: {str(e)}") from e
1729+
# Get cached manifest data
1730+
manifest_data = self._get_cached_manifest_data()
17061731

17071732
if "manifests" not in manifest_data:
17081733
raise KeyError("No 'manifests' key found in manifest data")
@@ -1727,12 +1752,8 @@ def get_validation_state(self) -> Optional[str]:
17271752
Raises:
17281753
C2paError: If there was an error getting the manifest JSON
17291754
"""
1730-
manifest_json_str = self.json()
1731-
try:
1732-
# Self-read to get the JSON
1733-
manifest_data = json.loads(manifest_json_str)
1734-
except json.JSONDecodeError as e:
1735-
raise C2paError(f"Failed to parse manifest JSON: {str(e)}") from e
1755+
# Get cached manifest data
1756+
manifest_data = self._get_cached_manifest_data()
17361757

17371758
return manifest_data.get("validation_state")
17381759

@@ -1751,12 +1772,8 @@ def get_validation_results(self) -> Optional[dict]:
17511772
Raises:
17521773
C2paError: If there was an error getting the manifest JSON
17531774
"""
1754-
manifest_json_str = self.json()
1755-
try:
1756-
# Self-read to get the JSON
1757-
manifest_data = json.loads(manifest_json_str)
1758-
except json.JSONDecodeError as e:
1759-
raise C2paError(f"Failed to parse manifest JSON: {str(e)}") from e
1775+
# Get cached manifest data
1776+
manifest_data = self._get_cached_manifest_data()
17601777

17611778
return manifest_data.get("validation_results")
17621779

tests/test_unit_tests.py

Lines changed: 174 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,13 @@ def test_get_active_manifest(self):
7373
expected_label = "contentauth:urn:uuid:c85a2b90-f1a0-4aa4-b17f-f938b475804e"
7474
self.assertEqual(active_manifest["label"], expected_label)
7575

76-
def test_get_manifest_by_label(self):
76+
def test_get_manifest_from_label(self):
7777
with open(self.testPath, "rb") as file:
7878
reader = Reader("image/jpeg", file)
7979

8080
# Test getting manifest by the specific label
8181
label = "contentauth:urn:uuid:c85a2b90-f1a0-4aa4-b17f-f938b475804e"
82-
manifest = reader.get_manifest_by_label(label)
82+
manifest = reader.get_manifest_from_label(label)
8383
self.assertEqual(manifest["label"], label)
8484

8585
# It should be the active manifest too, so cross-check
@@ -92,7 +92,7 @@ def test_stream_get_non_active_manifest_by_label(self):
9292
reader = Reader("video/mp4", file)
9393

9494
non_active_label = "urn:uuid:54281c07-ad34-430e-bea5-112a18facf0b"
95-
non_active_manifest = reader.get_manifest_by_label(non_active_label)
95+
non_active_manifest = reader.get_manifest_from_label(non_active_label)
9696
self.assertEqual(non_active_manifest["label"], non_active_label)
9797

9898
# Verify it's not the active manifest
@@ -109,7 +109,7 @@ def test_stream_get_non_active_manifest_by_label_not_found(self):
109109
# Try to get a manifest with a label that clearly doesn't exist...
110110
non_existing_label = "urn:uuid:clearly-not-existing"
111111
with self.assertRaises(KeyError):
112-
reader.get_manifest_by_label(non_existing_label)
112+
reader.get_manifest_from_label(non_existing_label)
113113

114114
def test_stream_read_get_validation_state(self):
115115
with open(self.testPath, "rb") as file:
@@ -336,6 +336,115 @@ def test_read_all_files_using_extension(self):
336336
except Exception as e:
337337
self.fail(f"Failed to read metadata from {filename}: {str(e)}")
338338

339+
def test_read_cached_all_files(self):
340+
"""Test reading C2PA metadata with cache functionality from all files in the fixtures/files-for-reading-tests directory"""
341+
reading_dir = os.path.join(self.data_dir, "files-for-reading-tests")
342+
343+
# Map of file extensions to MIME types
344+
mime_types = {
345+
'.jpg': 'image/jpeg',
346+
'.jpeg': 'image/jpeg',
347+
'.png': 'image/png',
348+
'.gif': 'image/gif',
349+
'.webp': 'image/webp',
350+
'.heic': 'image/heic',
351+
'.heif': 'image/heif',
352+
'.avif': 'image/avif',
353+
'.tif': 'image/tiff',
354+
'.tiff': 'image/tiff',
355+
'.mp4': 'video/mp4',
356+
'.avi': 'video/x-msvideo',
357+
'.mp3': 'audio/mpeg',
358+
'.m4a': 'audio/mp4',
359+
'.wav': 'audio/wav',
360+
'.pdf': 'application/pdf',
361+
}
362+
363+
# Skip system files
364+
skip_files = {
365+
'.DS_Store'
366+
}
367+
368+
for filename in os.listdir(reading_dir):
369+
if filename in skip_files:
370+
continue
371+
372+
file_path = os.path.join(reading_dir, filename)
373+
if not os.path.isfile(file_path):
374+
continue
375+
376+
# Get file extension and corresponding MIME type
377+
_, ext = os.path.splitext(filename)
378+
ext = ext.lower()
379+
if ext not in mime_types:
380+
continue
381+
382+
mime_type = mime_types[ext]
383+
384+
try:
385+
with open(file_path, "rb") as file:
386+
reader = Reader(mime_type, file)
387+
388+
# Test 1: Verify cache variables are initially None
389+
self.assertIsNone(reader._manifest_json_str_cache, f"JSON cache should be None initially for {filename}")
390+
self.assertIsNone(reader._manifest_data_cache, f"Manifest data cache should be None initially for {filename}")
391+
392+
# Test 2: Multiple calls to json() should return the same result and use cache
393+
json_data_1 = reader.json()
394+
self.assertIsNotNone(reader._manifest_json_str_cache, f"JSON cache not set after first json() call for {filename}")
395+
self.assertEqual(json_data_1, reader._manifest_json_str_cache, f"JSON cache doesn't match return value for {filename}")
396+
397+
json_data_2 = reader.json()
398+
self.assertEqual(json_data_1, json_data_2, f"JSON inconsistency for {filename}")
399+
self.assertIsInstance(json_data_1, str)
400+
401+
# Test 3: Test methods that use the cache
402+
try:
403+
# Test get_active_manifest() which uses _get_cached_manifest_data()
404+
active_manifest = reader.get_active_manifest()
405+
self.assertIsInstance(active_manifest, dict, f"Active manifest not dict for {filename}")
406+
407+
# Test 4: Verify cache is set after calling cache-using methods
408+
self.assertIsNotNone(reader._manifest_json_str_cache, f"JSON cache not set after get_active_manifest for {filename}")
409+
self.assertIsNotNone(reader._manifest_data_cache, f"Manifest data cache not set after get_active_manifest for {filename}")
410+
411+
# Test 5: Multiple calls to cache-using methods should return the same result
412+
active_manifest_2 = reader.get_active_manifest()
413+
self.assertEqual(active_manifest, active_manifest_2, f"Active manifest cache inconsistency for {filename}")
414+
415+
# Test get_validation_state() which uses the cache
416+
validation_state = reader.get_validation_state()
417+
# validation_state can be None, so just check it doesn't crash
418+
419+
# Test get_validation_results() which uses the cache
420+
validation_results = reader.get_validation_results()
421+
# validation_results can be None, so just check it doesn't crash
422+
423+
# Test 6: Multiple calls to validation methods should return the same result
424+
validation_state_2 = reader.get_validation_state()
425+
self.assertEqual(validation_state, validation_state_2, f"Validation state cache inconsistency for {filename}")
426+
427+
validation_results_2 = reader.get_validation_results()
428+
self.assertEqual(validation_results, validation_results_2, f"Validation results cache inconsistency for {filename}")
429+
430+
except KeyError as e:
431+
# Some files might not have active manifests or validation data
432+
# This is expected for some test files, so we'll skip cache testing for those
433+
pass
434+
435+
# Test 7: Verify the manifest contains expected fields
436+
manifest = json.loads(json_data_1)
437+
self.assertIn("manifests", manifest)
438+
self.assertIn("active_manifest", manifest)
439+
440+
# Test 8: Test cache clearing on close
441+
reader.close()
442+
self.assertIsNone(reader._manifest_json_str_cache, f"JSON cache not cleared for {filename}")
443+
self.assertIsNone(reader._manifest_data_cache, f"Manifest data cache not cleared for {filename}")
444+
445+
except Exception as e:
446+
self.fail(f"Failed to read cached metadata from {filename}: {str(e)}")
447+
339448
def test_reader_context_manager_with_exception(self):
340449
"""Test Reader state after exception in context manager."""
341450
try:
@@ -496,6 +605,67 @@ def test_reader_get_remote_url(self):
496605
self.assertEqual(remote_url, "https://cai-manifests.adobe.com/manifests/adobe-urn-uuid-5f37e182-3687-462e-a7fb-573462780391")
497606
self.assertFalse(reader.is_embedded())
498607

608+
def test_stream_read_and_parse_cached(self):
609+
"""Test reading and parsing with cache verification by repeating operations multiple times"""
610+
with open(self.testPath, "rb") as file:
611+
reader = Reader("image/jpeg", file)
612+
613+
# Verify cache starts as None
614+
self.assertIsNone(reader._manifest_json_str_cache, "JSON cache should be None initially")
615+
self.assertIsNone(reader._manifest_data_cache, "Manifest data cache should be None initially")
616+
617+
# First operation - should populate cache
618+
manifest_store_1 = json.loads(reader.json())
619+
title_1 = manifest_store_1["manifests"][manifest_store_1["active_manifest"]]["title"]
620+
self.assertEqual(title_1, DEFAULT_TEST_FILE_NAME)
621+
622+
# Verify cache is populated after first json() call
623+
self.assertIsNotNone(reader._manifest_json_str_cache, "JSON cache should be set after first json() call")
624+
self.assertEqual(manifest_store_1, json.loads(reader._manifest_json_str_cache), "Cached JSON should match parsed result")
625+
626+
# Repeat the same operation multiple times to verify cache usage
627+
for i in range(5):
628+
manifest_store = json.loads(reader.json())
629+
title = manifest_store["manifests"][manifest_store["active_manifest"]]["title"]
630+
self.assertEqual(title, DEFAULT_TEST_FILE_NAME, f"Title should be consistent on iteration {i+1}")
631+
632+
# Verify cache is still populated and consistent
633+
self.assertIsNotNone(reader._manifest_json_str_cache, f"JSON cache should remain set on iteration {i+1}")
634+
self.assertEqual(manifest_store, json.loads(reader._manifest_json_str_cache), f"Cached JSON should match parsed result on iteration {i+1}")
635+
636+
# Test methods that use the cache
637+
# Test get_active_manifest() which uses _get_cached_manifest_data()
638+
active_manifest_1 = reader.get_active_manifest()
639+
self.assertIsInstance(active_manifest_1, dict, "Active manifest should be a dict")
640+
641+
# Verify manifest data cache is populated
642+
self.assertIsNotNone(reader._manifest_data_cache, "Manifest data cache should be set after get_active_manifest()")
643+
644+
# Repeat get_active_manifest() multiple times to verify cache usage
645+
for i in range(3):
646+
active_manifest = reader.get_active_manifest()
647+
self.assertEqual(active_manifest_1, active_manifest, f"Active manifest should be consistent on iteration {i+1}")
648+
649+
# Verify cache remains populated
650+
self.assertIsNotNone(reader._manifest_data_cache, f"Manifest data cache should remain set on iteration {i+1}")
651+
652+
# Test get_validation_state() and get_validation_results() with cache
653+
validation_state_1 = reader.get_validation_state()
654+
validation_results_1 = reader.get_validation_results()
655+
656+
# Repeat validation methods to verify cache usage
657+
for i in range(3):
658+
validation_state = reader.get_validation_state()
659+
validation_results = reader.get_validation_results()
660+
661+
self.assertEqual(validation_state_1, validation_state, f"Validation state should be consistent on iteration {i+1}")
662+
self.assertEqual(validation_results_1, validation_results, f"Validation results should be consistent on iteration {i+1}")
663+
664+
# Verify cache clearing on close
665+
reader.close()
666+
self.assertIsNone(reader._manifest_json_str_cache, "JSON cache should be cleared on close")
667+
self.assertIsNone(reader._manifest_data_cache, "Manifest data cache should be cleared on close")
668+
499669
# TODO: Unskip once fixed configuration to read data is clarified
500670
# def test_read_cawg_data_file(self):
501671
# """Test reading C2PA metadata from C_with_CAWG_data.jpg file."""

0 commit comments

Comments
 (0)