|
10 | 10 | from transformers.cache_utils import MambaCache |
11 | 11 |
|
12 | 12 |
|
class CacheKeyValue:
    """Uniform accessor for the key/value tensors held by a transformers cache.

    Newer versions of ``transformers`` store per-layer tensors on
    ``cache.layers`` (as ``layer.keys`` / ``layer.values``), while older
    versions expose ``key_cache`` / ``value_cache`` lists directly on the
    cache object. This adapter presents the legacy ``key_cache`` /
    ``value_cache`` attributes regardless of which layout *cache* uses.
    """

    def __init__(self, cache: "Cache"):  # noqa: F821
        if not hasattr(cache, "layers"):
            # Legacy layout: the cache already carries the two lists; reuse
            # them directly (no copy).
            self.key_cache = cache.key_cache
            self.value_cache = cache.value_cache
        else:
            # New layout: collect each layer's tensors, skipping slots that
            # were never populated (still ``None``).
            self.key_cache = [lyr.keys for lyr in cache.layers if lyr.keys is not None]
            self.value_cache = [lyr.values for lyr in cache.layers if lyr.values is not None]
| 24 | + |
13 | 25 | def flatten_unflatten_for_dynamic_shapes( |
14 | 26 | obj: Any, |
15 | 27 | use_dict: bool = False, |
@@ -221,19 +233,20 @@ def __init__(self): |
221 | 233 | ), |
222 | 234 | ) |
223 | 235 | cache = transformers.cache_utils.StaticCache( |
224 | | - _config(), |
| 236 | + config=_config(), |
225 | 237 | max_batch_size=key_value_pairs[0][0].shape[0], |
226 | 238 | device=key_value_pairs[0][0].device, |
227 | 239 | dtype=key_value_pairs[0][0].dtype, |
228 | 240 | max_cache_len=max_cache_len, |
229 | 241 | ) |
| 242 | + ca = CacheKeyValue(cache) |
230 | 243 | for i in range(len(key_value_pairs)): |
231 | 244 | assert ( |
232 | 245 | key_value_pairs[i][0].shape == key_value_pairs[i][1].shape |
233 | 246 | ), f"Shape mismatch {key_value_pairs[i][0].shape} != {key_value_pairs[i][1].shape}" |
234 | 247 | d = key_value_pairs[i][1].shape[2] |
235 | | - cache.key_cache[i][:, :, :d, :] = key_value_pairs[i][0] |
236 | | - cache.value_cache[i][:, :, :d, :] = key_value_pairs[i][1] |
| 248 | + ca.key_cache[i][:, :, :d, :] = key_value_pairs[i][0] |
| 249 | + ca.value_cache[i][:, :, :d, :] = key_value_pairs[i][1] |
237 | 250 | return cache |
238 | 251 |
|
239 | 252 |
|
@@ -300,23 +313,24 @@ def __init__(self): |
300 | 313 | self.sliding_window = key_value_pairs[0][0].shape[2] |
301 | 314 |
|
302 | 315 | cache = transformers.cache_utils.SlidingWindowCache( |
303 | | - _config(), |
| 316 | + config=_config(), |
304 | 317 | max_batch_size=key_value_pairs[0][0].shape[0], |
305 | 318 | max_cache_len=key_value_pairs[0][0].shape[2], # same as sliding_window |
306 | 319 | device=key_value_pairs[0][0].device, |
307 | 320 | dtype=key_value_pairs[0][0].dtype, |
308 | 321 | ) |
| 322 | + ca = CacheKeyValue(cache) |
309 | 323 | for i in range(len(key_value_pairs)): |
310 | | - assert cache.key_cache[i].shape == key_value_pairs[i][0].shape, ( |
| 324 | + assert ca.key_cache[i].shape == key_value_pairs[i][0].shape, ( |
311 | 325 | f"Shape mismatch, expected {cache.key_cache[i].shape}, " |
312 | 326 | f"got {key_value_pairs[i][0].shape}" |
313 | 327 | ) |
314 | | - cache.key_cache[i][:, :, :, :] = key_value_pairs[i][0] |
315 | | - assert cache.value_cache[i].shape == key_value_pairs[i][1].shape, ( |
| 328 | + ca.key_cache[i][:, :, :, :] = key_value_pairs[i][0] |
| 329 | + assert ca.value_cache[i].shape == key_value_pairs[i][1].shape, ( |
316 | 330 | f"Shape mismatch, expected {cache.value_cache[i].shape}, " |
317 | 331 | f"got {key_value_pairs[i][1].shape}" |
318 | 332 | ) |
319 | | - cache.value_cache[i][:, :, :, :] = key_value_pairs[i][1] |
| 333 | + ca.value_cache[i][:, :, :, :] = key_value_pairs[i][1] |
320 | 334 | return cache |
321 | 335 |
|
322 | 336 |
|
@@ -373,9 +387,10 @@ class _config: |
373 | 387 | num_heads = key_value_pairs[0][0].shape[1] if key_value_pairs else None |
374 | 388 | head_dim = key_value_pairs[0][0].shape[-1] if key_value_pairs else None |
375 | 389 | num_attention_heads = key_value_pairs[0][1].shape[1] if key_value_pairs else None |
| 390 | + num_hidden_layers = len(key_value_pairs) |
376 | 391 |
|
377 | 392 | cache = transformers.cache_utils.HybridCache( |
378 | | - _config(), max_cache_len=max_cache_len, max_batch_size=max_batch_size |
| 393 | + config=_config(), max_cache_len=max_cache_len, max_batch_size=max_batch_size |
379 | 394 | ) |
380 | 395 | for i, (key, value) in enumerate(key_value_pairs): |
381 | 396 | cache.update(key, value, i) |
|
0 commit comments