Commit ca9dcc6

add support for hybrid cache
1 parent 7e3cc45 commit ca9dcc6

10 files changed: +449 lines added, -33 lines removed

CHANGELOGS.rst

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@ Change Logs
 0.7.6
 +++++

+* :pr:`192`: add support for Gemma-3, add serialization for HybridCache
+
 0.7.5
 +++++

_unittests/ut_helpers/test_cache_helper.py

Lines changed: 41 additions & 0 deletions
@@ -7,10 +7,12 @@
     flatten_unflatten_for_dynamic_shapes,
     make_dynamic_cache,
     make_encoder_decoder_cache,
+    make_hybrid_cache,
     make_mamba_cache,
     make_sliding_window_cache,
     make_static_cache,
 )
+from onnx_diagnostic.helpers.torch_helper import torch_deepcopy
 from onnx_diagnostic.export import CoupleInputsDynamicShapes
 from onnx_diagnostic.torch_export_patches.patch_inputs import (
     convert_dynamic_axes_into_dynamic_shapes,
@@ -209,6 +211,45 @@ def test_unflatten_flatten_static_cache(self):
                 self.string_type(unflat, with_shape=True),
             )

+    def test_make_hybrid_cache(self):
+        cache = make_hybrid_cache(
+            [
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+            ],
+        )
+        text = self.string_type(cache, with_shape=True)
+        self.assertEqual(
+            "HybridCache(key_cache=#3[T1s4x5x6x7,T1s4x5x6x7,T1s4x5x6x7], "
+            "value_cache=#3[T1s4x5x6x7,T1s4x5x6x7,T1s4x5x6x7])",
+            text,
+        )
+        self.assertEqual(0, max_diff(cache, cache)["abs"])
+        self.assertEqual(0, max_diff(cache, torch_deepcopy(cache))["abs"])
+
+    def test_unflatten_flatten_hybrid_cache(self):
+        with torch_export_patches(patch_transformers=True):
+            c2 = make_hybrid_cache(
+                [
+                    (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                    (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                    (torch.rand((4, 5, 6, 7)), torch.rand((4, 5, 6, 7))),
+                ],
+            )
+            self.assertEqual(0, max_diff(c2, c2)["abs"])
+            self.assertIsInstance(c2, transformers.cache_utils.HybridCache)
+            flat, _spec = torch.utils._pytree.tree_flatten(c2)
+            self.assertIsInstance(flat, list)
+            self.assertEqual(len(flat), 6)
+            unflat = flatten_unflatten_for_dynamic_shapes(c2)
+            self.assertIsInstance(unflat, list)
+            self.assertEqual(len(unflat), 2)
+            self.assertEqual(
+                "#2[#3[T1s4x5x6x7,T1s4x5x6x7,T1s4x5x6x7],#3[T1s4x5x6x7,T1s4x5x6x7,T1s4x5x6x7]]",
+                self.string_type(unflat, with_shape=True),
+            )
+

 if __name__ == "__main__":
     unittest.main(verbosity=2)

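For reference, here is a minimal standalone sketch of the round trip the new tests exercise, outside of unittest. It assumes make_hybrid_cache and flatten_unflatten_for_dynamic_shapes are importable from onnx_diagnostic.helpers.cache_helper and torch_export_patches from onnx_diagnostic.torch_export_patches, as the test imports suggest. With 3 layers of (key, value) pairs, tree_flatten yields 6 leaves and flatten_unflatten_for_dynamic_shapes regroups them into 2 lists of 3.

    import torch
    from onnx_diagnostic.helpers.cache_helper import (
        flatten_unflatten_for_dynamic_shapes,
        make_hybrid_cache,
    )
    from onnx_diagnostic.torch_export_patches import torch_export_patches  # assumed import path

    cache = make_hybrid_cache(
        [(torch.rand(4, 5, 6, 7), torch.rand(4, 5, 6, 7)) for _ in range(3)]
    )
    with torch_export_patches(patch_transformers=True):
        # 3 layers x (key, value) -> 6 flat tensors once HybridCache is registered with pytree
        flat, _spec = torch.utils._pytree.tree_flatten(cache)
        # regrouped into [key_cache, value_cache], i.e. 2 lists of 3 tensors
        unflat = flatten_unflatten_for_dynamic_shapes(cache)
    print(len(flat), len(unflat))  # 6 2
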
_unittests/ut_tasks/test_tasks_image_text_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def test_image_text_to_text_idefics(self):
     @requires_transformers("4.53")
     @requires_torch("2.7.99")
     def test_image_text_to_text_gemma3(self):
-        mid = "tiny-random/gemma-3"
+        mid = "google/gemma-3-4b-it"
         data = get_untrained_model_with_inputs(mid, verbose=1, add_second_input=True)
         self.assertEqual(data["task"], "image-text-to-text")
         self.assertIn((data["size"], data["n_weights"]), [(34401152, 8600288)])

_unittests/ut_tasks/try_tasks.py

Lines changed: 44 additions & 10 deletions
@@ -289,14 +289,41 @@ def test_imagetext2text_generation_idefics(self):

     @never_test()
     def test_imagetext2text_generation_gemma3(self):
+        """
+        ::
+
+            dict(input_ids:T7s1x281,
+                pixel_values:T16s1x3x896x896,
+                attention_mask:dict(full_attention:T9s1x1x281x380,sliding_attention:T9s1x1x281x380),
+                position_ids:T7s1x281,
+                past_key_values:HybridCache(
+                    key_cache=#34[T1s1x4x380x256,...],
+                    value_cache=#34[T1s1x4x380x256,...]),
+                token_type_ids:T7s1x281,
+                cache_position:T7s281,
+                logits_to_keep:1)
+            dict(input_ids:T7s1x1,
+                pixel_values:None,
+                attention_mask:dict(full_attention:T9s1x1x1x380,sliding_attention:T9s1x1x1x380),
+                position_ids:T7s1x1,
+                past_key_values:HybridCache(
+                    key_cache=#34[T1s1x4x380x256,...],
+                    value_cache=#34[T1s1x4x380x256,...]),
+                token_type_ids:T7s1x1,
+                cache_position:T7s1,
+                logits_to_keep:1)
+        """
+        from transformers import AutoProcessor, Gemma3ForConditionalGeneration
         import torch
-        from transformers import Gemma3ForConditionalGeneration, AutoProcessor

-        mid = "tiny-random/gemma-3"
-        processor = AutoProcessor.from_pretrained(mid)
+        # model_id = "tiny-random/gemma-3"
+        model_id = "google/gemma-3-4b-it"
+
         model = Gemma3ForConditionalGeneration.from_pretrained(
-            mid, torch_dtype=torch.bfloat16, device_map="auto"
-        )
+            model_id, device_map="auto"
+        ).eval()
+
+        processor = AutoProcessor.from_pretrained(model_id, use_fast=True)

         messages = [
             {
@@ -314,19 +341,26 @@ def test_imagetext2text_generation_gemma3(self):
                 ],
             },
         ]
+
         inputs = processor.apply_chat_template(
             messages,
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
         ).to(model.device, dtype=torch.bfloat16)
-        print()
-        with steal_forward(model):
-            generated_ids = model.generate(**inputs, max_new_tokens=10)
-        decoded = processor.decode(generated_ids, skip_special_tokens=True)

-        print(decoded[0])
+        input_len = inputs["input_ids"].shape[-1]
+
+        print()
+        print(f"-- input_len={input_len}")
+        # steal forward creates a bug...
+        # with steal_forward(model), torch.inference_mode():
+        with torch.inference_mode():
+            generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        generation = generation[0][input_len:]
+        decoded = processor.decode(generation, skip_special_tokens=True)
+        print(decoded)

     @never_test()
     def test_automatic_speech_recognition(self):

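The docstring added above records the runtime shapes Gemma-3 passes to forward, including a HybridCache of 34 layers with 1x4x380x256 key/value tensors. As a hedged sketch, the make_hybrid_cache helper introduced by this commit can rebuild a past_key_values with the same layout for offline experiments:

    import torch
    from onnx_diagnostic.helpers.cache_helper import make_hybrid_cache

    # 34 layers of (key, value) tensors shaped (batch=1, heads=4, cache_len=380, head_dim=256),
    # matching the HybridCache recorded in the docstring above
    past_key_values = make_hybrid_cache(
        [(torch.zeros(1, 4, 380, 256), torch.zeros(1, 4, 380, 256)) for _ in range(34)]
    )
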
onnx_diagnostic/helpers/cache_helper.py

Lines changed: 62 additions & 0 deletions
@@ -318,3 +318,65 @@ def __init__(self):
         )
         cache.value_cache[i][:, :, :, :] = key_value_pairs[i][1]
     return cache
+
+
+def make_hybrid_cache(
+    key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
+    max_cache_len: Optional[int] = None,
+    max_batch_size: Optional[int] = None,
+) -> transformers.cache_utils.HybridCache:
+    """
+    Creates an instance of :class:`transformers.cache_utils.HybridCache`.
+    This version is valid for ``transformers < 4.50``.
+
+    :param key_value_pairs: list of pairs of (key, values)
+    :return: :class:`transformers.cache_utils.HybridCache`
+
+    Example:
+
+    .. runpython::
+        :showcode:
+
+        import torch
+        from onnx_diagnostic.helpers import string_type
+        from onnx_diagnostic.helpers.cache_helper import make_hybrid_cache
+
+        n_layers = 2
+        bsize, nheads, slen, dim = 2, 4, 3, 7
+
+        past_key_values = make_hybrid_cache(
+            [
+                (
+                    torch.randn(bsize, nheads, slen, dim),
+                    torch.randn(bsize, nheads, slen, dim),
+                )
+                for i in range(n_layers)
+            ]
+        )
+        print(string_type(past_key_values, with_shape=True))
+    """
+    if key_value_pairs:
+        assert (
+            not max_batch_size and not max_cache_len
+        ), "key_value_pairs is not empty, do not specify max_cache_len and max_batch_size"
+        max_batch_size = key_value_pairs[0][0].shape[0]
+        max_cache_len = key_value_pairs[0][0].shape[2]
+    else:
+        assert (
+            max_batch_size and max_cache_len
+        ), "key_value_pairs is empty, max_batch_size and max_cache_len are required"
+    _ = max_cache_len
+
+    class _config:
+        max_cache_len = _
+        batch_size = max_batch_size
+        num_heads = key_value_pairs[0][0].shape[1] if key_value_pairs else None
+        head_dim = key_value_pairs[0][0].shape[-1] if key_value_pairs else None
+        num_attention_heads = key_value_pairs[0][1].shape[1] if key_value_pairs else None
+
+    cache = transformers.cache_utils.HybridCache(
+        _config(), max_cache_len=max_cache_len, max_batch_size=max_batch_size
+    )
+    for i, (key, value) in enumerate(key_value_pairs):
+        cache.update(key, value, i)
+    return cache

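A note on the new helper's defaults: when key_value_pairs is not empty, max_batch_size and max_cache_len are inferred from the first key tensor (dimensions 0 and 2) and must not be passed explicitly. A small illustration; the expected output follows the string_type format asserted in the unit test above:

    import torch
    from onnx_diagnostic.helpers import string_type
    from onnx_diagnostic.helpers.cache_helper import make_hybrid_cache

    # key/value tensors are (batch, num_heads, seq_len, head_dim) = (2, 4, 3, 7),
    # so max_batch_size=2 and max_cache_len=3 are inferred from the first pair
    cache = make_hybrid_cache(
        [(torch.rand(2, 4, 3, 7), torch.rand(2, 4, 3, 7)) for _ in range(2)]
    )
    print(string_type(cache, with_shape=True))
    # expected: HybridCache(key_cache=#2[T1s2x4x3x7,T1s2x4x3x7], value_cache=#2[T1s2x4x3x7,T1s2x4x3x7])
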
onnx_diagnostic/helpers/helper.py

Lines changed: 80 additions & 2 deletions
@@ -565,15 +565,15 @@ def string_type(
         "HybridCache",
     }:
         kc = string_type(
-            obj.key_cache,
+            list(obj.key_cache),
             with_shape=with_shape,
             with_min_max=with_min_max,
             with_device=with_device,
             limit=limit,
             verbose=verbose,
         )
         vc = string_type(
-            obj.value_cache,
+            list(obj.value_cache),
             with_shape=with_shape,
             with_min_max=with_min_max,
             with_device=with_device,
@@ -584,6 +584,27 @@ def string_type(
             print(f"[string_type] CACHE2:{type(obj)}")
         return f"{obj.__class__.__name__}(key_cache={kc}, value_cache={vc})"

+    if obj.__class__.__name__ == "StaticLayer":
+        kc = string_type(
+            list(obj.keys),
+            with_shape=with_shape,
+            with_min_max=with_min_max,
+            with_device=with_device,
+            limit=limit,
+            verbose=verbose,
+        )
+        vc = string_type(
+            list(obj.values),
+            with_shape=with_shape,
+            with_min_max=with_min_max,
+            with_device=with_device,
+            limit=limit,
+            verbose=verbose,
+        )
+        if verbose:
+            print(f"[string_type] SL:{type(obj)}")
+        return f"{obj.__class__.__name__}(keys={kc}, values={vc})"
+
     if obj.__class__.__name__ == "EncoderDecoderCache":
         att = string_type(
             obj.self_attention_cache,
@@ -668,6 +689,24 @@ def string_type(
                 f"dtype={obj.dtype}, shape={obj.shape})"
             )

+    if obj.__class__.__name__ == "KeyValuesWrapper":
+        import transformers
+
+        assert isinstance(
+            obj, transformers.cache_utils.KeyValuesWrapper
+        ), f"Unexpected type {type(obj)}"
+        if verbose:
+            print(f"[string_type] KW0:{type(obj)}")
+        s = string_type(
+            list(obj),
+            with_shape=with_shape,
+            with_min_max=with_min_max,
+            with_device=with_device,
+            limit=limit,
+            verbose=verbose,
+        )
+        return f"{obj.__class__.__name__}[{obj.cache_type}]{s}"
+
     if isinstance(obj, torch.nn.Module):
         if verbose:
             print(f"[string_type] MM:{type(obj)}")
@@ -1429,6 +1468,31 @@ def max_diff(
             f"level={level}"
         )

+    # backup function in case pytorch does not know how to serialize.
+    if expected.__class__.__name__ == "HybridCache":
+        if got.__class__.__name__ == "HybridCache":
+            if verbose >= 6:
+                print(f"[max_diff] HybridCache: {string_type(expected)} ? {string_type(got)}")
+            return max_diff(
+                [expected.key_cache, expected.value_cache],
+                [got.key_cache, got.value_cache],
+                verbose=verbose,
+                hist=hist,
+            )
+        if isinstance(got, tuple) and len(got) == 2:
+            return max_diff(
+                [expected.key_cache, expected.value_cache],
+                [got[0], got[1]],
+                debug_info=_debug(expected.__class__.__name__),
+                **_dkws,
+            )
+        raise AssertionError(
+            f"HybridCache not fully implemented with classes "
+            f"{expected.__class__.__name__!r} and {got.__class__.__name__!r}, "
+            f"and expected={string_type(expected)}, got={string_type(got)},\n"
+            f"level={level}"
+        )
+
     if expected.__class__.__name__ == "StaticCache":
         if got.__class__.__name__ == "StaticCache":
             if verbose >= 6:
@@ -1526,6 +1590,20 @@ def max_diff(
             **_dkws,
         )

+    if expected.__class__.__name__ == "KeyValuesWrapper":
+        if verbose >= 6:
+            print(f"[max_diff] KeyValuesWrapper: {string_type(expected)} ? {string_type(got)}")
+        if got.__class__.__name__ != expected.__class__.__name__:
+            return dict(abs=np.inf, rel=np.inf, sum=np.inf, n=np.inf, dnan=np.inf)
+        if got.cache_type != expected.cache_type:
+            return dict(abs=np.inf, rel=np.inf, sum=np.inf, n=np.inf, dnan=np.inf)
+        return max_diff(
+            list(expected),
+            list(got),
+            debug_info=_debug(expected.__class__.__name__),
+            **_dkws,
+        )
+
     raise AssertionError(
         f"Not implemented with implemented with expected="
         f"{string_type(expected)}, got={string_type(got)},\n"

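The new max_diff branch accepts either two HybridCache instances or a HybridCache compared against a (key_cache, value_cache) pair. A hedged sketch, assuming max_diff is exposed from onnx_diagnostic.helpers the same way string_type is:

    import torch
    from onnx_diagnostic.helpers import max_diff  # assumed export, mirroring string_type
    from onnx_diagnostic.helpers.cache_helper import make_hybrid_cache

    cache = make_hybrid_cache(
        [(torch.rand(2, 4, 3, 8), torch.rand(2, 4, 3, 8)) for _ in range(2)]
    )
    # HybridCache vs HybridCache: compared through [key_cache, value_cache]
    print(max_diff(cache, cache)["abs"])  # 0
    # HybridCache vs a 2-tuple, the second branch added above
    print(max_diff(cache, (cache.key_cache, cache.value_cache))["abs"])  # 0
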
onnx_diagnostic/helpers/torch_helper.py

Lines changed: 3 additions & 0 deletions
@@ -14,6 +14,7 @@
 from .cache_helper import (
     make_dynamic_cache,
     make_encoder_decoder_cache,
+    make_hybrid_cache,
     make_sliding_window_cache,
     make_mamba_cache,
     make_static_cache,
@@ -789,6 +790,8 @@ def torch_deepcopy(value: Any) -> Any:
             torch_deepcopy(list(zip(value.key_cache, value.value_cache))),
             max_cache_len=value.max_cache_len,
         )
+    if value.__class__.__name__ == "HybridCache":
+        return make_hybrid_cache(torch_deepcopy(list(zip(value.key_cache, value.value_cache))))
     if value.__class__.__name__ == "SlidingWindowCache":
         return make_sliding_window_cache(
             torch_deepcopy(list(zip(value.key_cache, value.value_cache)))

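With this branch, torch_deepcopy rebuilds a HybridCache by re-running make_hybrid_cache on deep copies of the key/value tensors; the unit test above checks the copy matches the original through max_diff. A minimal usage sketch:

    import torch
    from onnx_diagnostic.helpers import string_type
    from onnx_diagnostic.helpers.cache_helper import make_hybrid_cache
    from onnx_diagnostic.helpers.torch_helper import torch_deepcopy

    cache = make_hybrid_cache([(torch.rand(1, 2, 3, 4), torch.rand(1, 2, 3, 4))])
    copy = torch_deepcopy(cache)  # a new HybridCache, not a reference to the original
    print(string_type(copy, with_shape=True))
    # expected: HybridCache(key_cache=#1[T1s1x2x3x4], value_cache=#1[T1s1x2x3x4])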