
Commit bfac0a1

fix caches
1 parent e89379d commit bfac0a1

File tree

4 files changed: +81 −10 lines changed


_unittests/ut_helpers/test_helper.py

Lines changed: 53 additions & 2 deletions
@@ -10,6 +10,7 @@
     skipif_ci_windows,
     hide_stdout,
     requires_onnx,
+    requires_transformers,
 )
 from onnx_diagnostic.helpers.helper import (
     string_type,
@@ -40,7 +41,13 @@
     onnx_dtype_to_torch_dtype,
     torch_dtype_to_onnx_dtype,
 )
-from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache, make_encoder_decoder_cache
+from onnx_diagnostic.helpers.cache_helper import (
+    make_dynamic_cache,
+    make_encoder_decoder_cache,
+    make_static_cache,
+    make_hybrid_cache,
+    make_sliding_window_cache,
+)
 from onnx_diagnostic.torch_models.hghub.hub_api import get_pretrained_config


@@ -584,11 +591,55 @@ def test_flatten_encoder_decoder_cache(self):
         s = string_type(inputs)
         self.assertIn("EncoderDecoderCache", s)
 
-    def test_string_typeçconfig(self):
+    def test_string_type_config(self):
         conf = get_pretrained_config("microsoft/phi-2", use_only_preinstalled=True)
         s = string_type(conf)
         self.assertStartsWith("PhiConfig(**{", s)
 
+    @requires_transformers("4.55")
+    def test_max_diff_causal_output(self):
+        from transformers.modeling_outputs import CausalLMOutputWithPast
+
+        logits = torch.rand((3, 4))
+        cache = make_dynamic_cache([(torch.rand((3, 4)), torch.rand((3, 4)))])
+        out1 = CausalLMOutputWithPast(logits=logits, past_key_values=cache)
+        out2 = CausalLMOutputWithPast(logits=logits, past_key_values=cache)
+        self.assertEqual(max_diff(out1, out2)["abs"], 0)
+        self.assertEqual(
+            max_diff(out1, [logits, cache.layers[0].keys, cache.layers[0].values])["abs"], 0
+        )
+
+    def test_max_diff_others(self):
+        t = torch.rand((3, 4))
+        self.assertEqual(max_diff(t, t)["abs"], 0)
+        self.assertEqual(max_diff([t], [t])["abs"], 0)
+        self.assertEqual(max_diff([t], (t,))["abs"], 0)
+        self.assertEqual(max_diff((t,), [t])["abs"], 0)
+        self.assertEqual(max_diff((t,), (t,))["abs"], 0)
+        self.assertEqual(max_diff({"t": t}, {"t": t})["abs"], 0)
+
+    def test_max_diff_caches(self):
+        cache = make_dynamic_cache([(torch.rand((3, 4)), torch.rand((3, 4)))])
+        self.assertEqual(max_diff(cache, cache)["abs"], 0)
+        cache = make_static_cache(
+            [(torch.rand((1, 1, 3, 4)), torch.rand((1, 1, 3, 4)))], max_cache_len=3
+        )
+        self.assertEqual(max_diff(cache, cache)["abs"], 0)
+        cache = make_hybrid_cache([(torch.rand((1, 1, 3, 4)), torch.rand((1, 1, 3, 4)))])
+        self.assertEqual(max_diff(cache, cache)["abs"], 0)
+        cache = make_sliding_window_cache(
+            [(torch.rand((1, 1, 3, 4)), torch.rand((1, 1, 3, 4)))]
+        )
+        self.assertEqual(max_diff(cache, cache)["abs"], 0)
+        cache = make_encoder_decoder_cache(cache, cache)
+        self.assertEqual(max_diff(cache, cache)["abs"], 0)
+
+    def test_max_diff_caches_flat(self):
+        data = [(torch.rand((3, 4)), torch.rand((3, 4)))]
+        cache1 = make_dynamic_cache(data)
+        cache2 = make_dynamic_cache([*data[0]])
+        self.assertEqual(max_diff(cache1, cache2)["abs"], 0)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
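
The new tests rely on the cache helpers now accepting a flat list of tensors as well as a list of (key, value) pairs. A minimal sketch of the behavior test_max_diff_caches_flat checks, assuming onnx_diagnostic and torch are installed:

    import torch
    from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
    from onnx_diagnostic.helpers.helper import max_diff

    key, value = torch.rand((3, 4)), torch.rand((3, 4))
    # Paired form: a list of (key, value) tuples.
    cache_pairs = make_dynamic_cache([(key, value)])
    # Flat form: [key, value, ...]; the helper pairs the tensors internally.
    cache_flat = make_dynamic_cache([key, value])
    # Both forms should build the same cache.
    assert max_diff(cache_pairs, cache_flat)["abs"] == 0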

_unittests/ut_torch_models/test_validate_models.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ def test_validate_tiny_llms_bfloat16(self):
         self.assertIn("onnx_filename", data)
 
     @requires_transformers("4.53")
-    @requires_torch("2.7.99")
+    @requires_torch("2.8.99")
     @requires_experimental()
     @hide_stdout()
     def test_validate_microsoft_phi4_reasoning(self):
@@ -60,7 +60,7 @@ def test_validate_microsoft_phi4_reasoning(self):
         self.assertIn("onnx_filename", data)
 
     @requires_transformers("4.53")
-    @requires_torch("2.7.99")
+    @requires_torch("2.8.99")
     @requires_experimental()
     @hide_stdout()
     def test_validate_microsoft_phi3_mini_128k(self):

onnx_diagnostic/helpers/cache_helper.py

Lines changed: 19 additions & 6 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import packaging.version as pv
 import torch
 import transformers
@@ -152,10 +152,18 @@ def make_dynamic_shapes_kv_cache(
     return [shape_of_one for _ in range(CacheKeyValue(cache).n_layers * 2)]
 
 
+def _preprocess_key_value_pairs(
+    key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
+) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+    if not key_value_pairs or isinstance(key_value_pairs[0], tuple):
+        return key_value_pairs
+    return list(zip(key_value_pairs[::2], key_value_pairs[1::2]))
+
+
 if pv.Version(transformers.__version__) > pv.Version("4.49.99999"):
 
     def make_dynamic_cache(
-        key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+        key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
     ) -> transformers.cache_utils.DynamicCache:
         """
         Creates an instance of :class:`transformers.cache_utils.DynamicCache`.
@@ -191,6 +199,7 @@ def make_dynamic_cache(
         ``transformers>=4.56``. Before that version, only FakeTensor with static dimensions
         are supported.
         """
+        key_value_pairs = _preprocess_key_value_pairs(key_value_pairs)
         if (
             key_value_pairs
             and isinstance(key_value_pairs[0][0], torch._subclasses.fake_tensor.FakeTensor)
@@ -230,7 +239,7 @@ def make_dynamic_cache(
 else:
 
     def make_dynamic_cache(
-        key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+        key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
     ) -> transformers.cache_utils.DynamicCache:
         """
         Creates an instance of :class:`transformers.cache_utils.DynamicCache`.
@@ -262,14 +271,15 @@ def make_dynamic_cache(
             )
             print(string_type(past_key_values, with_shape=True))
         """
+        key_value_pairs = _preprocess_key_value_pairs(key_value_pairs)
         cache = transformers.cache_utils.DynamicCache(len(key_value_pairs))  # type: ignore
         for i, (key, value) in enumerate(key_value_pairs):
             cache.update(key, value, i)
         return cache
 
 
 def make_static_cache(
-    key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+    key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
     max_cache_len: Optional[int] = None,
 ) -> transformers.cache_utils.DynamicCache:
     """
@@ -302,6 +312,7 @@ def make_static_cache(
         )
         print(string_type(past_key_values, with_shape=True))
     """
+    key_value_pairs = _preprocess_key_value_pairs(key_value_pairs)
 
     class _config:
         def __init__(self):
@@ -444,9 +455,10 @@ def get_text_config(self, *args, **kwargs):
 
 
 def make_sliding_window_cache(
-    key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+    key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
 ) -> transformers.cache_utils.SlidingWindowCache:
     "Creates a :class:`transformers.cache_utils.SlidingWindowCache`."
+    key_value_pairs = _preprocess_key_value_pairs(key_value_pairs)
 
     class _config:
         def __init__(self):
@@ -499,7 +511,7 @@ def get_text_config(self, *args, **kwargs):
 
 
 def make_hybrid_cache(
-    key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+    key_value_pairs: Union[List[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]],
     max_cache_len: Optional[int] = None,
     max_batch_size: Optional[int] = None,
     sliding_window: Optional[int] = None,
@@ -584,6 +596,7 @@ def make_hybrid_cache(
             self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)
     """
+    key_value_pairs = _preprocess_key_value_pairs(key_value_pairs)
     layer_types = None
     if key_value_pairs:
         assert (
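
The pairing itself is done by the new _preprocess_key_value_pairs helper: when the first element is not a tuple, even-indexed tensors are treated as keys and odd-indexed tensors as values. A self-contained sketch of that logic (pair_flat_list is a hypothetical stand-in for the private helper):

    import torch

    def pair_flat_list(tensors):
        # Pair even-indexed tensors (keys) with odd-indexed tensors (values),
        # mirroring _preprocess_key_value_pairs above.
        return list(zip(tensors[::2], tensors[1::2]))

    k0, v0, k1, v1 = (torch.rand((3, 4)) for _ in range(4))
    pairs = pair_flat_list([k0, v0, k1, v1])
    # The same tensor objects come back, grouped into (key, value) tuples.
    assert pairs[0][0] is k0 and pairs[0][1] is v0
    assert pairs[1][0] is k1 and pairs[1][1] is v1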

onnx_diagnostic/helpers/helper.py

Lines changed: 7 additions & 0 deletions
@@ -1064,6 +1064,13 @@ def max_diff(
                 f"[max_diff] CausalLMOutputWithPast: {string_type(expected)} "
                 f"? {string_type(got)}"
             )
+        if got.__class__.__name__ == "CausalLMOutputWithPast":
+            return max_diff(
+                [expected.logits, *flatten_object(expected.past_key_values)],
+                [got.logits, *flatten_object(got.past_key_values)],
+                debug_info=_debug(expected.__class__.__name__),
+                **_dkws,
+            )
         return max_diff(
             [expected.logits, *flatten_object(expected.past_key_values)],
             got,
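
The pre-existing path below the new branch compares a CausalLMOutputWithPast against an already-flattened list; the added branch flattens both sides when the second argument is also a CausalLMOutputWithPast. A hedged sketch of the now-supported call, mirroring test_max_diff_causal_output (which guards it with transformers>=4.55):

    import torch
    from transformers.modeling_outputs import CausalLMOutputWithPast
    from onnx_diagnostic.helpers.cache_helper import make_dynamic_cache
    from onnx_diagnostic.helpers.helper import max_diff

    logits = torch.rand((3, 4))
    cache = make_dynamic_cache([(torch.rand((3, 4)), torch.rand((3, 4)))])
    out1 = CausalLMOutputWithPast(logits=logits, past_key_values=cache)
    out2 = CausalLMOutputWithPast(logits=logits, past_key_values=cache)
    # Object-vs-object comparison: both sides are flattened to
    # [logits, *cache tensors] before the difference is computed.
    assert max_diff(out1, out2)["abs"] == 0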
