
Commit f92a334

fix cache
1 parent 85156ef commit f92a334

3 files changed: +87 -75 lines changed

_unittests/ut_torch_export_patches/test_dynamic_class.py

Lines changed: 65 additions & 70 deletions
@@ -8,7 +8,6 @@
     ignore_warnings,
     hide_stdout,
     requires_torch,
-    has_transformers,
 )
 from onnx_diagnostic.helpers import string_type
 from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, CacheKeyValue
@@ -22,76 +21,72 @@ class TestOnnxExportErrors(ExtTestCase):
     @ignore_warnings(UserWarning)
     @hide_stdout()
     def test_export_dynamic_cache_update(self):
-        values = [True, False] if has_transformers("4.50") else [False]
-        for strict in self.subloop(values, verbose=1):
-
-            class SubModelCache(torch.nn.Module):
-                def forward(self, cache):
-                    cc = CacheKeyValue(cache)
-                    # If not patched...
-                    # Fails with transformers>=4.54 because function ``parse_processor_args``
-                    # relies in inspect and the exporter is not very fond of that.
-                    # torch._dynamo.exc.Unsupported: id() with unsupported args
-                    # Explanation: Dynamo doesn't know how to trace id()
-                    # call with args
-                    # (GetAttrVariable(ConstantVariable(NoneType: None), __init__),)
-                    # Hint: Supported args are Tensors, and functions/nn.Modules/user-defined
-                    # objects from outside the compiled region.
-                    # Hint: It may be possible to write Dynamo tracing rules for this code.
-                    d = cache.__class__()
-                    d.update(cc.key_cache[0] + 1, cc.value_cache[0] + 2, 0)
-                    d.update(cc.key_cache[0] + 3, cc.value_cache[0] + 5, 1)
-                    return d
-
-            class SubModel(torch.nn.Module):
-                def forward(self, x, cache):
-                    cc = CacheKeyValue(cache)
-                    return x + cc.key_cache[0] + cc.value_cache[0]
-
-            class Model(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-                    self.sub = SubModel()
-                    self.subcache = SubModelCache()
-
-                def forward(self, x, cache):
-                    return self.sub(x, self.subcache(cache))
-
-            # no patch
-            cache = make_dynamic_cache(
-                [(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)]
+        class SubModelCache(torch.nn.Module):
+            def forward(self, cache):
+                cc = CacheKeyValue(cache)
+                # If not patched...
+                # Fails with transformers>=4.54 because function ``parse_processor_args``
+                # relies in inspect and the exporter is not very fond of that.
+                # torch._dynamo.exc.Unsupported: id() with unsupported args
+                # Explanation: Dynamo doesn't know how to trace id()
+                # call with args
+                # (GetAttrVariable(ConstantVariable(NoneType: None), __init__),)
+                # Hint: Supported args are Tensors, and functions/nn.Modules/user-defined
+                # objects from outside the compiled region.
+                # Hint: It may be possible to write Dynamo tracing rules for this code.
+                d = cache.__class__()
+                d.update(cc.key_cache[0] + 1, cc.value_cache[0] + 2, 0)
+                d.update(cc.key_cache[0] + 3, cc.value_cache[0] + 5, 1)
+                return d
+
+        class SubModel(torch.nn.Module):
+            def forward(self, x, cache):
+                cc = CacheKeyValue(cache)
+                y = cc.key_cache[0] + cc.value_cache[0]
+                return x + y
+
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sub = SubModel()
+                self.subcache = SubModelCache()
+
+            def forward(self, x, cache):
+                return self.sub(x, self.subcache(cache))
+
+        # no patch
+        cache = make_dynamic_cache([(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)])
+        model = Model()
+        inputs = (torch.randn((5, 6, 5, 6)), cache)
+        expected = model(*inputs)
+
+        DYN = torch.export.Dim.DYNAMIC
+
+        # patching
+        with torch_export_patches(patch_transformers=True, verbose=10):
+            got = model(*inputs)
+            self.assertEqualArray(expected, got)
+            ep = torch.export.export(
+                model,
+                inputs,
+                dynamic_shapes=(
+                    {0: DYN, 2: DYN},
+                    [[{0: DYN, 2: DYN}], [{0: DYN, 2: DYN}]],
+                ),
+                strict=False,
             )
-            model = Model()
-            inputs = (torch.randn((5, 6, 5, 6)), cache)
-            expected = model(*inputs)
-
-            DYN = torch.export.Dim.DYNAMIC
-
-            # patching
-            with torch_export_patches(patch_transformers=True, verbose=10):
-                got = model(*inputs)
-                self.assertEqualArray(expected, got)
-                ep = torch.export.export(
-                    model,
-                    inputs,
-                    dynamic_shapes=(
-                        {0: DYN, 2: DYN},
-                        [[{0: DYN, 2: DYN}], [{0: DYN, 2: DYN}]],
-                    ),
-                    strict=strict,
-                )
-                mod = ep.module()
-                got = mod(*inputs)
-                self.assertEqualArray(expected, got)
-
-                class MyInterpreter(torch.fx.Interpreter):
-                    def call_function(self, target, args, kwargs):
-                        res = super().call_function(target, args, kwargs)
-                        return res
-
-                args, _spec = torch.utils._pytree.tree_flatten(inputs)
-                got = MyInterpreter(ep.module()).run(*args)
-                self.assertEqualAny(expected, got)
+            mod = ep.module()
+            got = mod(*inputs)
+            self.assertEqualArray(expected, got)
+
+            class MyInterpreter(torch.fx.Interpreter):
+                def call_function(self, target, args, kwargs):
+                    res = super().call_function(target, args, kwargs)
+                    return res
+
+            args, _spec = torch.utils._pytree.tree_flatten(inputs)
+            got = MyInterpreter(ep.module()).run(*args)
+            self.assertEqualAny(expected, got)
 
     @ignore_warnings(UserWarning)
     @requires_torch(
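
To see the new code path outside the test harness, here is a condensed, standalone sketch of what the rewritten test now exercises, with the assertions and the FX interpreter check dropped. The import path of torch_export_patches is assumed (it is not visible in this diff); everything else mirrors the lines added above.

import torch
from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, CacheKeyValue
from onnx_diagnostic.torch_export_patches import torch_export_patches  # assumed import path


class Model(torch.nn.Module):
    def forward(self, x, cache):
        cc = CacheKeyValue(cache)  # view over the key/value tensors of the cache
        return x + cc.key_cache[0] + cc.value_cache[0]


model = Model()
cache = make_dynamic_cache([(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)])
inputs = (torch.randn((5, 6, 5, 6)), cache)
DYN = torch.export.Dim.DYNAMIC

# The patches make the transformers cache traceable; the test now always exports with strict=False.
with torch_export_patches(patch_transformers=True):
    ep = torch.export.export(
        model,
        inputs,
        dynamic_shapes=({0: DYN, 2: DYN}, [[{0: DYN, 2: DYN}], [{0: DYN, 2: DYN}]]),
        strict=False,
    )
print(ep)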

onnx_diagnostic/helpers/cache_helper.py

Lines changed: 21 additions & 4 deletions
@@ -183,7 +183,7 @@ def make_dynamic_cache(
                 f"Unexpected number of layers in the cache ({len(cache.layers)}), "
                 f"{len(key_value_pairs)} expected."
             )
-        return cache
+        return finalize_cache(cache)
 
 else:
 
@@ -335,7 +335,7 @@ def get_text_config(self):
             f"Unexpected number of layers in the cache ({len(cache.layers)}), "
             f"{len(key_value_pairs)} expected."
         )
-    return cache
+    return finalize_cache(cache)
 
 
 def make_encoder_decoder_cache(
@@ -391,7 +391,7 @@ def get_text_config(self):
                 f"got {key_value_pairs[i][1].shape}"
             )
         cache.ssm_states[i][:, :, :] = key_value_pairs[i][1]
-    return cache
+    return finalize_cache(cache)
 
 
 def make_sliding_window_cache(
@@ -446,7 +446,7 @@ def get_text_config(self):
             f"Unexpected number of layers in the cache ({len(cache.layers)}), "
             f"{len(key_value_pairs)} expected."
         )
-    return cache
+    return finalize_cache(cache)
 
 
 def make_hybrid_cache(
@@ -605,4 +605,21 @@ def get_text_config(self):
             f"Unexpected number of layers in the cache ({len(cache.layers)}), "
             f"{len(key_value_pairs)} expected."
         )
+    return finalize_cache(cache)
+
+
+def finalize_cache(cache: transformers.cache_utils.Cache) -> transformers.cache_utils.Cache:
+    """
+    Ensures the created cache is consistent.
+    Returns the cache modified inplace.
+    """
+    if (
+        hasattr(cache, "layer_class_to_replicate")
+        and hasattr(cache, "layers")
+        and cache.layers
+        and not cache.layer_class_to_replicate
+    ):
+        # This is used to expand the cache when it does not contains enough layers.
+        # This is needed since transformers>4.55.3
+        cache.layer_class_to_replicate = cache.layers[0].__class__
     return cache
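
A minimal sketch (not part of the commit) of what finalize_cache changes in practice, assuming transformers>4.55.3 where a cache keeps one object per layer in cache.layers: every make_* helper now also records which layer class to replicate, which, per the comment above, is what lets the cache expand when it does not yet contain enough layers.

import torch
from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache

# Build a one-layer cache; make_dynamic_cache now ends with finalize_cache(cache).
cache = make_dynamic_cache([(torch.ones((2, 4, 3, 8)), torch.zeros((2, 4, 3, 8)))])

# With this commit, a freshly built cache advertises the layer class to replicate
# when a later update targets a layer index that does not exist yet
# (assumes transformers>4.55.3, where this attribute exists).
print(getattr(cache, "layer_class_to_replicate", None))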

onnx_diagnostic/torch_export_patches/patch_inputs.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def _make_shape(subset: Dict, cls: type, value: Any) -> Any:
             f"Inconsistencies in subset={subset}, found={values}, "
             f"it cannot be a {cls}, value={string_type(value)}"
         )
-    cache_length = len(value.key_cache)
+    cache_length = len(value.layers if hasattr(value, "layers") else value.key_cache)
     for v in subset.values():
         axes = v
         break
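
This one-line change accounts for the two cache layouts found across transformers versions: newer caches expose a layers list, while older DynamicCache objects only expose the parallel key_cache/value_cache lists. A small illustrative helper (hypothetical name, same logic as the patched line) makes the branch explicit:

def cache_num_layers(cache) -> int:
    # Newer transformers caches keep one object per layer in `cache.layers`;
    # older DynamicCache versions only expose parallel key/value tensor lists.
    return len(cache.layers if hasattr(cache, "layers") else cache.key_cache)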
