Support MambaCache in torch_deepcopy

xadupre · xadupre · commit ec103a4e74ff · 2025-04-25T14:41:16.000+02:00
diff --git a/_unittests/ut_helpers/test_torch_test_helper.py b/_unittests/ut_helpers/test_torch_test_helper.py
@@ -2,8 +2,9 @@
 import ml_dtypes
 import onnx
 import torch
+import transformers
 from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
-from onnx_diagnostic.helpers import string_type
+from onnx_diagnostic.helpers import max_diff, string_type
 from onnx_diagnostic.helpers.torch_test_helper import (
     dummy_llm,
     to_numpy,
@@ -13,7 +14,12 @@
     to_any,
     torch_deepcopy,
 )
-from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
+from onnx_diagnostic.helpers.cache_helper import (
+    make_dynamic_cache,
+    make_encoder_decoder_cache,
+    make_mamba_cache,
+    make_sliding_window_cache,
+)
 
 TFLOAT = onnx.TensorProto.FLOAT
 
@@ -85,19 +91,66 @@ def test_to_any(self):
         at = to_any(a, torch.float16)
         self.assertIn("T10r", string_type(at))
 
-    def test_torch_deepcopy(self):
+    def test_torch_deepcopy_cache_dce(self):
         c1 = make_dynamic_cache([(torch.rand((4, 4, 4)), torch.rand((4, 4, 4)))])
         c2 = make_encoder_decoder_cache(
             make_dynamic_cache([(torch.rand((4, 4, 4)), torch.rand((4, 4, 4)))]),
             make_dynamic_cache([(torch.rand((5, 5, 5)), torch.rand((5, 5, 5)))]),
         )
+        cc = torch_deepcopy(c2)
+        self.assertEqual(type(c2), type(c2))
+        self.assertEqual(max_diff(c2, cc)["abs"], 0)
         a = {"t": [(torch.tensor([1, 2]), c1, c2), {4, 5}]}
         at = torch_deepcopy(a)
         hash1 = string_type(at, with_shape=True, with_min_max=True)
         c1.key_cache[0] += 1000
         hash2 = string_type(at, with_shape=True, with_min_max=True)
         self.assertEqual(hash1, hash2)
 
+    def test_torch_deepcopy_mamba_cache(self):
+        cache = make_mamba_cache(
+            [
+                (torch.rand((4, 4, 4)), torch.rand((4, 4, 4))),
+                (torch.rand((4, 4, 4)), torch.rand((4, 4, 4))),
+                (torch.rand((4, 4, 4)), torch.rand((4, 4, 4))),
+            ]
+        )
+        at = torch_deepcopy(cache)
+        self.assertEqual(type(cache), type(at))
+        self.assertEqual(max_diff(cache, at)["abs"], 0)
+        hash1 = string_type(at, with_shape=True, with_min_max=True)
+        cache.conv_states[0] += 1000
+        hash2 = string_type(at, with_shape=True, with_min_max=True)
+        self.assertEqual(hash1, hash2)
+
+    def test_torch_deepcopy_base_model_outputs(self):
+        bo = transformers.modeling_outputs.BaseModelOutput(
+            last_hidden_state=torch.rand((4, 4, 4))
+        )
+        at = torch_deepcopy(bo)
+        self.assertEqual(max_diff(bo, at)["abs"], 0)
+        self.assertEqual(type(bo), type(at))
+        hash1 = string_type(at, with_shape=True, with_min_max=True)
+        bo.last_hidden_state[0] += 1000
+        hash2 = string_type(at, with_shape=True, with_min_max=True)
+        self.assertEqual(hash1, hash2)
+
+    def test_torch_deepcopy_sliding_windon_cache(self):
+        cache = make_sliding_window_cache(
+            [
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+            ]
+        )
+        at = torch_deepcopy(cache)
+        self.assertEqual(type(cache), type(at))
+        self.assertEqual(max_diff(cache, at)["abs"], 0)
+        hash1 = string_type(at, with_shape=True, with_min_max=True)
+        cache.key_cache[0] += 1000
+        hash2 = string_type(at, with_shape=True, with_min_max=True)
+        self.assertEqual(hash1, hash2)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/__init__.py b/onnx_diagnostic/__init__.py
@@ -3,5 +3,5 @@
 Functions, classes to dig into a model when this one is right, slow, wrong...
 """
 
-__version__ = "0.4.1"
+__version__ = "0.4.2"
 __author__ = "Xavier Dupré"
diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py
@@ -1404,6 +1404,28 @@ def max_diff(
             f"level={level}"
         )
 
+    if expected.__class__.__name__ == "SlidingWindowCache":
+        if got.__class__.__name__ == "SlidingWindowCache":
+            if verbose >= 6:
+                print(f"[max_diff] DynamicCache: {string_type(expected)} ? {string_type(got)}")
+            return max_diff(
+                [expected.key_cache, expected.value_cache],
+                [got.key_cache, got.value_cache],
+                verbose=verbose,
+            )
+        if isinstance(got, tuple) and len(got) == 2:
+            return max_diff(
+                [expected.key_cache, expected.value_cache],
+                [got[0], got[1]],
+                verbose=verbose,
+            )
+        raise AssertionError(
+            f"SlidingWindowCache not fully implemented with classes "
+            f"{expected.__class__.__name__!r} and {got.__class__.__name__!r}, "
+            f"and expected={string_type(expected)}, got={string_type(got)},\n"
+            f"level={level}"
+        )
+
     if expected.__class__.__name__ == "EncoderDecoderCache":
         if got.__class__.__name__ == "EncoderDecoderCache":
             if verbose >= 6:
diff --git a/onnx_diagnostic/helpers/torch_test_helper.py b/onnx_diagnostic/helpers/torch_test_helper.py
@@ -8,6 +8,7 @@
     make_dynamic_cache,
     make_encoder_decoder_cache,
     make_sliding_window_cache,
+    make_mamba_cache,
 )
 
 
@@ -376,6 +377,9 @@ def torch_deepcopy(value: Any) -> Any:
             torch_deepcopy(value.self_attention_cache),
             torch_deepcopy(value.cross_attention_cache),
         )
+    if value.__class__.__name__ == "MambaCache":
+        return make_mamba_cache(list(zip(value.conv_states, value.ssm_states)))
+
     if value.__class__ in torch.utils._pytree.SUPPORTED_NODES:
         args, spec = torch.utils._pytree.tree_flatten(value)
         new_args = torch_deepcopy(args)