@@ -362,6 +362,7 @@ def make_hybrid_cache(
     key_value_pairs: List[Tuple[torch.Tensor, torch.Tensor]],
     max_cache_len: Optional[int] = None,
     max_batch_size: Optional[int] = None,
+    sliding_window: Optional[int] = None,
 ) -> transformers.cache_utils.HybridCache:
     """
     Creates an instance of :class:`transformers.cache_utils.HybridCache`.
@@ -392,30 +393,110 @@ def make_hybrid_cache(
             ]
         )
         print(string_type(past_key_values, with_shape=True))
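+
+    When the key/value pairs mix two different sequence lengths, the shorter one is
+    interpreted as the sliding window. A minimal sketch of such a call (the tensor
+    shapes below are only illustrative, not taken from a real model):
+
+    .. code-block:: python
+
+        past_key_values = make_hybrid_cache(
+            [
+                (torch.randn(2, 8, 32, 64), torch.randn(2, 8, 32, 64)),  # full-attention layer
+                (torch.randn(2, 8, 8, 64), torch.randn(2, 8, 8, 64)),  # sliding-window layer
+            ]
+        )
+        # inferred: max_cache_len=32, sliding_window=8,
+        # layer_types=["full_attention", "sliding_attention"]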
+
+    The following excerpt from the ``HybridCache`` constructor shows how the cache
+    shapes are determined:
+
+    .. code-block:: python
+
+        self.max_cache_len = (
+            max_cache_len if max_cache_len is not None else config.max_position_embeddings
+        )
+
+        # Sliding layers can't be larger than the overall max cache len
+        self.sliding_window_len = min(config.sliding_window, self.max_cache_len)
+        self.max_batch_size = max_batch_size
+
+        self.head_dim = (
+            config.head_dim if hasattr(config, "head_dim")
+            else config.hidden_size // config.num_attention_heads
+        )
+
+        self._dtype = dtype
+        self.num_key_value_heads = (
+            config.num_attention_heads
+            if getattr(config, "num_key_value_heads", None) is None
+            else config.num_key_value_heads
+        )
+
+        # If the attribute does not exist in the config, fallback to a simple StaticCache
+        if hasattr(config, "layer_types"):
+            self.is_sliding = [
+                layer_type != "full_attention" for layer_type in config.layer_types
+            ]
+        else:
+            self.is_sliding = [False] * config.num_hidden_layers
+
+        self.key_cache: list[torch.Tensor] = []
+        self.value_cache: list[torch.Tensor] = []
+        global_cache_shape = (
+            self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim
+        )
+        sliding_cache_shape = (
+            self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim
+        )
+        self.sliding_window = min(config.sliding_window, max_cache_len)
+        device = torch.device(device) if device is not None else None
+        for i in range(config.num_hidden_layers):
+            layer_device = layer_device_map[i] if layer_device_map is not None else device
+            cache_shape = sliding_cache_shape if self.is_sliding[i] else global_cache_shape
+            new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
+            new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
+            torch._dynamo.mark_static_address(new_layer_key_cache)
+            torch._dynamo.mark_static_address(new_layer_value_cache)
+            self.key_cache.append(new_layer_key_cache)
+            self.value_cache.append(new_layer_value_cache)
     """
+    layer_types = None
     if key_value_pairs:
         assert (
             not max_batch_size and not max_cache_len
         ), "key_value_pairs is not empty, do not specify max_cache_len and max_batch_size"
         max_batch_size = key_value_pairs[0][0].shape[0]
-        max_cache_len = key_value_pairs[0][0].shape[2]
+        # Infer the cache lengths from the keys: a single unique length means every
+        # layer uses full attention; two lengths mean the longer one is the
+        # full-attention cache and the shorter one is the sliding window.
+        sets_of_dim = set(kv[0].shape[2] for kv in key_value_pairs)
+        if len(sets_of_dim) == 1:
+            max_cache_len = sets_of_dim.pop()
+            sliding_window = max_cache_len
+        else:
+            assert (
+                len(sets_of_dim) == 2
+            ), f"Not implemented for more than 2 dimensions {sets_of_dim}"
+            max_cache_len = max(sets_of_dim)
+            sliding_window = min(sets_of_dim)
+            layer_types = [
+                "full_attention" if i == max_cache_len else "sliding_attention"
+                for i in [kv[0].shape[2] for kv in key_value_pairs]
+            ]
     else:
         assert (
             max_batch_size and max_cache_len
         ), "key_value_pairs is empty, max_batch_size and max_cache_len are required"
-        _ = max_cache_len
+        if sliding_window is None:
+            sliding_window = max_cache_len
+    _max_cache_len = max_cache_len
+    _sliding_window = sliding_window
 
     class _config:
-        max_cache_len = _
+        max_cache_len = _max_cache_len
         batch_size = max_batch_size
         num_heads = key_value_pairs[0][0].shape[1] if key_value_pairs else None
         head_dim = key_value_pairs[0][0].shape[-1] if key_value_pairs else None
         num_attention_heads = key_value_pairs[0][1].shape[1] if key_value_pairs else None
         num_hidden_layers = len(key_value_pairs)
+        sliding_window = _sliding_window
+
+    if layer_types:
+        _config.layer_types = layer_types
 
     cache = transformers.cache_utils.HybridCache(
         config=_config(), max_cache_len=max_cache_len, max_batch_size=max_batch_size
     )
     for i, (key, value) in enumerate(key_value_pairs):
-        cache.update(key, value, i)
+        # HybridCache.update expects cache_position in cache_kwargs so it knows which
+        # slots of the (full or sliding) layer cache to fill.
+        cache.update(
+            key,
+            value,
+            i,
+            cache_kwargs={
+                "cache_position": torch.arange(0, key.shape[2], dtype=torch.int64).to(
+                    key.device
+                )
+            },
+        )
     return cache