fix conversion issues

xadupre · xadupre · commit 400fb24b1516 · 2025-08-20T19:08:50.000+02:00
diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
@@ -5,6 +5,8 @@ Change Logs
 +++++
 
 
+* :pr:`200`: fixes patches for transformers 4.55.1+: DynamicCache is no longer registered by default,
+  the registration code moved to executorch.py in transformers
 * :pr:`199`: delete hidden_size and num_attention_heads modification in a config
 * :pr:`198`: support gpt-oss
 * :pr:`197`: updates CI for torch 2.8
diff --git a/_unittests/ut_helpers/test_bench_run.py b/_unittests/ut_helpers/test_bench_run.py
@@ -10,7 +10,7 @@
     make_configs,
     run_benchmark,
 )
-from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
+from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, CacheKeyValue
 
 
 class TestBenchRun(ExtTestCase):
@@ -153,9 +153,10 @@ def test_max_diff(self):
     def test_max_diff_dynamic_cache(self):
         t1 = torch.tensor([0, 1], dtype=torch.float32)
         cache = make_dynamic_cache([(torch.ones((2, 2)), (torch.ones((2, 2)) * 2))])
+        dc = CacheKeyValue(cache)
         md = max_diff(
             (t1, cache),
-            (t1, cache.key_cache[0], cache.value_cache[0]),
+            (t1, dc.key_cache[0], dc.value_cache[0]),
             flatten=True,
             verbose=10,
         )
diff --git a/_unittests/ut_torch_export_patches/test_patch_serialization_transformers.py b/_unittests/ut_torch_export_patches/test_patch_serialization_transformers.py
@@ -8,6 +8,7 @@
     make_static_cache,
     make_sliding_window_cache,
     flatten_unflatten_for_dynamic_shapes,
+    CacheKeyValue,
 )
 from onnx_diagnostic.torch_export_patches.onnx_export_errors import (
     torch_export_patches,
@@ -48,7 +49,8 @@ def test_encoder_decoder_cache_deepcopy(self):
     def test_encoder_decoder_cache_export(self):
         class Model(torch.nn.Module):
             def forward(self, cache):
-                return cache.self_attention_cache.key_cache[0]
+                att = CacheKeyValue(cache.self_attention_cache)
+                return att.key_cache[0]
 
         cache1 = make_dynamic_cache(
             [(torch.randn(2, 4, 3, 7), torch.randn(2, 4, 3, 7)) for i in range(3)]
@@ -88,6 +90,7 @@ def test_dynamic_cache_flatten(self):
     def test_dynamic_cache_export(self):
         class Model(torch.nn.Module):
             def forward(self, cache):
+                cache = CacheKeyValue(cache)
                 return cache.key_cache[0]
 
         cache = make_dynamic_cache(
@@ -180,7 +183,8 @@ def test_base_sliding_window_cache_unflatten_flatten(self):
     def test_sliding_window_cache_export(self):
         class Model(torch.nn.Module):
             def forward(self, cache):
-                return cache.key_cache[0]
+                dc = CacheKeyValue(cache)
+                return dc.key_cache[0]
 
         cache = make_sliding_window_cache(
             [
@@ -268,6 +272,7 @@ def test_static_cache(self):
         # export
         class Model(torch.nn.Module):
             def forward(self, cache):
+                cache = CacheKeyValue(cache)
                 return cache.key_cache[0]
 
         model = Model()
diff --git a/_unittests/ut_torch_models/test_tiny_llms_onnx.py b/_unittests/ut_torch_models/test_tiny_llms_onnx.py
@@ -107,6 +107,8 @@ def test_bypass_onnx_export_tiny_llm_official_full(self):
         self.assertEqual(
             {"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)
         )
+        print("***", self.string_type(inputs, with_shape=True))
+        print("---", type(model))
         with torch_export_patches(
             patch_transformers=True, verbose=1, stop_if_static=1
         ) as modificator:
diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_torch.py b/onnx_diagnostic/torch_export_patches/patches/patch_torch.py
@@ -27,8 +27,8 @@ def _catch_produce_guards_and_solve_constraints(
     dynamic_shapes: Union[Dict[str, Any], Tuple[Any], List[Any], None],
     equalities_inputs: "EqualityConstraint",  # noqa: F821
     original_signature: inspect.Signature,
-    _is_torch_jit_trace: bool = False,
     verbose: int = 0,
+    **kwargs,
 ):
     try:
         return previous_function(
@@ -37,7 +37,7 @@ def _catch_produce_guards_and_solve_constraints(
             dynamic_shapes=dynamic_shapes,
             equalities_inputs=equalities_inputs,
             original_signature=original_signature,
-            _is_torch_jit_trace=_is_torch_jit_trace,
+            **kwargs,
         )
     except Exception as e:
         if not int(os.environ.get("SKIP_SOLVE_CONSTRAINTS", "1")):
@@ -51,7 +51,7 @@ def _catch_produce_guards_and_solve_constraints(
                 f"dynamic_shapes={dynamic_shapes}\n"
                 f"equalities_inputs={equalities_inputs}\n"
                 f"original_signature={original_signature}\n"
-                f"_is_torch_jit_trace={_is_torch_jit_trace}\n"
+                f"kwargs={kwargs}\n"
                 f"exc={e}\ngm={gm}"
             )
         torch._dynamo.reset()
diff --git a/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py b/onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -24,6 +24,14 @@
     patch_masking_utils = False
 
 
+try:
+    # transformers>= 4.55.1
+    from transformers.cache_utils import DynamicLayer
+
+    patch_DynamicLayer = hasattr(DynamicLayer, "lazy_initialization")
+except ImportError:
+    patch_DynamicLayer = False
+
 from ...ext_test_case import has_transformers
 from ...helpers.torch_helper import is_torchdynamo_exporting
 
@@ -158,6 +166,20 @@ def patched_parse_processor_args(
         return processor_kwargs, remaining_kwargs
 
 
+if patch_DynamicLayer:
+
+    class patched_DynamicLayer:
+        _PATCHES_ = ["lazy_initialization"]
+        _PATCHED_CLASS_ = DynamicLayer
+
+        def lazy_initialization(self, key_states: torch.Tensor):
+            self.dtype, self.device = key_states.dtype, key_states.device
+            new_shape = list(key_states.shape)
+            new_shape[-2] = 0
+            self.keys = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+            self.values = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+
+
 def _patch_make_causal_mask(
     input_ids_shape: torch.Size,
     dtype: torch.dtype,
@@ -324,6 +346,14 @@ def update(
                     self.key_cache[layer_idx] = key_states
                     self.value_cache[layer_idx] = value_states
                 else:
+                    torch._check(
+                        len(self.key_cache[layer_idx].shape) == len(key_states.shape),
+                        lambda: (
+                            f"Rank mismatch len(self.key_cache[layer_idx].shape)="
+                            f"{len(self.key_cache[layer_idx].shape)}, "
+                            f"len(key_states.shape)={len(key_states.shape)}"
+                        ),
+                    )
                     self.key_cache[layer_idx] = torch.cat(
                         [self.key_cache[layer_idx], key_states], dim=-2
                     )

Original file line number	Diff line number	Diff line change
`@@ -107,6 +107,8 @@ def test_bypass_onnx_export_tiny_llm_official_full(self):`
`107`	`107`	`self.assertEqual(`
`108`	`108`	`{"attention_mask", "past_key_values", "input_ids", "position_ids"}, set(inputs)`
`109`	`109`	`)`
	`110`	`+ print("***", self.string_type(inputs, with_shape=True))`
	`111`	`+ print("---", type(model))`
`110`	`112`	`with torch_export_patches(`
`111`	`113`	`patch_transformers=True, verbose=1, stop_if_static=1`
`112`	`114`	`) as modificator:`