Commit 242bb2c

One cache class to rule them all (#40276)
* remove all classes
* fix generate
* start replacing everywhere
* finish removing everywhere
* typo
* typo
* fix
* typo
* remove num_layers=1
* CI
* fix all docstrings
* review
* style
1 parent 1054494 commit 242bb2c

File tree: 16 files changed, +300 additions, -577 deletions


src/transformers/cache_utils.py

Lines changed: 189 additions & 285 deletions
Large diffs are not rendered by default.

src/transformers/generation/configuration_utils.py

Lines changed: 15 additions & 28 deletions
@@ -43,35 +43,23 @@
 
 logger = logging.get_logger(__name__)
 METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")
-STATIC_CACHE_CLASSES_MAPPING = {}
-QUANT_BACKEND_CLASSES_MAPPING = {}
-ALL_CACHE_IMPLEMENTATIONS = []
+STATIC_CACHE_IMPLEMENTATIONS = ("static", "offloaded_static")
+DYNAMIC_CACHE_IMPLEMENTATIONS = ("dynamic", "offloaded", "quantized")
+# All the following are redundant and deprecated, but kept for BC
+DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS = (
+    "sliding_window",
+    "hybrid",
+    "hybrid_chunked",
+    "offloaded_hybrid",
+    "offloaded_hybrid_chunked",
+)
+ALL_STATIC_CACHE_IMPLEMENTATIONS = STATIC_CACHE_IMPLEMENTATIONS + DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS
+ALL_CACHE_IMPLEMENTATIONS = ALL_STATIC_CACHE_IMPLEMENTATIONS + DYNAMIC_CACHE_IMPLEMENTATIONS
+
 
 if is_torch_available():
-    from ..cache_utils import (
-        HQQQuantizedCache,
-        HybridCache,
-        HybridChunkedCache,
-        OffloadedHybridCache,
-        OffloadedStaticCache,
-        QuantoQuantizedCache,
-        SlidingWindowCache,
-        StaticCache,
-    )
     from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
 
-    STATIC_CACHE_CLASSES_MAPPING = {
-        "static": StaticCache,
-        "offloaded_static": OffloadedStaticCache,
-        "sliding_window": SlidingWindowCache,
-        "hybrid": HybridCache,
-        "hybrid_chunked": HybridChunkedCache,
-        "offloaded_hybrid": OffloadedHybridCache,
-        "offloaded_hybrid_chunked": OffloadedHybridCache,
-    }
-    QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache}
-    ALL_CACHE_IMPLEMENTATIONS = list(STATIC_CACHE_CLASSES_MAPPING.keys()) + ["offloaded", "dynamic", "quantized"]
-
 
 class GenerationMode(ExplicitEnum):
     """

@@ -173,9 +161,8 @@ class GenerationConfig(PushToHubMixin):
 
             - `"dynamic"`: [`DynamicCache`]
             - `"static"`: [`StaticCache`]
-            - `"offloaded_static"`: [`OffloadedStaticCache`]
-            - `"sliding_window"`: [`SlidingWindowCache`]
-            - `"hybrid"`: [`HybridCache`]
+            - `"offloaded"`: [`DynamicCache(offloaded=True)`]
+            - `"offloaded_static"`: [`StaticCache(offloaded=True)`]
             - `"quantized"`: [`QuantizedCache`]
 
             If none is specified, we will use the default cache for the model (which is often [`DynamicCache`]). See
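To illustrate the slimmed-down option set, here is a minimal usage sketch (not part of the diff). The checkpoint name and token count are placeholders; it assumes a causal LM whose config supports the static cache.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B"  # placeholder checkpoint, any static-cache-capable causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("The KV cache is", return_tensors="pt")

# "static" now also covers the former sliding-window/hybrid variants; the
# per-layer structure is inferred from the model config. Deprecated names
# such as "hybrid" still work but emit a deprecation warning.
out = model.generate(**inputs, max_new_tokens=20, cache_implementation="static")
print(tokenizer.decode(out[0], skip_special_tokens=True))
```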

src/transformers/generation/utils.py

Lines changed: 18 additions & 24 deletions
@@ -32,9 +32,8 @@
     Cache,
     DynamicCache,
     EncoderDecoderCache,
-    HybridChunkedCache,
-    OffloadedCache,
-    OffloadedHybridCache,
+    QuantizedCache,
+    StaticCache,
 )
 from ..configuration_utils import PretrainedConfig
 from ..dynamic_module_utils import (

@@ -71,8 +70,9 @@
     _prepare_token_type_ids,
 )
 from .configuration_utils import (
-    QUANT_BACKEND_CLASSES_MAPPING,
-    STATIC_CACHE_CLASSES_MAPPING,
+    ALL_STATIC_CACHE_IMPLEMENTATIONS,
+    DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS,
+    STATIC_CACHE_IMPLEMENTATIONS,
     GenerationConfig,
     GenerationMode,
 )

@@ -1822,27 +1822,18 @@ def _get_cache(self, cache_implementation: str, batch_size: int, max_cache_len:
 
         Returns the resulting cache object.
         """
-        if cache_implementation == "hybrid" and "llama4" in getattr(self.config, "model_type", ""):
-            cache_implementation = "hybrid_chunked"
-
-        cache_cls: Cache = STATIC_CACHE_CLASSES_MAPPING[cache_implementation]
         requires_cross_attention_cache = (
             self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
         )
+        offload_cache = "offloaded" in cache_implementation
 
         if hasattr(self, "_cache"):
             cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache
 
-            if cache_implementation == "sliding_window":
-                max_cache_len = min(self.config.sliding_window, max_cache_len)
-
         need_new_cache = (
             not hasattr(self, "_cache")
-            or (not isinstance(cache_to_check, cache_cls))
+            or cache_to_check.offloading != offload_cache
             or cache_to_check.max_batch_size != batch_size
-            or isinstance(
-                cache_to_check, (HybridChunkedCache, OffloadedHybridCache)
-            )  # due to internal slicing, we always re-init
             or cache_to_check.max_cache_len < max_cache_len
         )
 
@@ -1853,12 +1844,12 @@ def _get_cache(self, cache_implementation: str, batch_size: int, max_cache_len:
             )
 
         if need_new_cache:
-            cache_kwargs = {"config": self.config.get_text_config(), "max_cache_len": max_cache_len}
-            self._cache = cache_cls(**cache_kwargs)
+            cache_kwargs = {"config": self.config, "max_cache_len": max_cache_len, "offloading": offload_cache}
+            self._cache = StaticCache(**cache_kwargs)
             if requires_cross_attention_cache:
                 encoder_kwargs = cache_kwargs.copy()
                 encoder_kwargs["max_cache_len"] = model_kwargs["encoder_outputs"][0].shape[1]
-                self._cache = EncoderDecoderCache(self._cache, cache_cls(**encoder_kwargs))
+                self._cache = EncoderDecoderCache(self._cache, StaticCache(**encoder_kwargs))
         else:
             self._cache.reset()
         return self._cache

@@ -1957,7 +1948,12 @@ def _prepare_cache_for_generation(
             else {}
         )
         if generation_config.cache_implementation is not None:
-            if generation_config.cache_implementation in STATIC_CACHE_CLASSES_MAPPING:
+            if generation_config.cache_implementation in ALL_STATIC_CACHE_IMPLEMENTATIONS:
+                if generation_config.cache_implementation in DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS:
+                    logger.warning_once(
+                        f"Using `cache_implementation='{generation_config.cache_implementation}' is deprecated. Please only "
+                        f"use one of {STATIC_CACHE_IMPLEMENTATIONS}, and the layer structure will be inferred automatically."
+                    )
                 model_kwargs[cache_name] = self._get_cache(
                     cache_implementation=generation_config.cache_implementation,
                     batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,

@@ -1977,7 +1973,6 @@
                 cache_config["config"] = self.config.get_text_config()
                 # Pop the backend from the config (defaults to quanto if not defined)
                 backend = cache_config.pop("backend", "quanto")
-                cache_class = QUANT_BACKEND_CLASSES_MAPPING[backend]
 
                 if backend == "quanto" and not is_optimum_quanto_available():
                     raise ImportError(

@@ -1989,10 +1984,9 @@
                         "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
                         "Please install it via with `pip install hqq`"
                     )
-
-                model_kwargs[cache_name] = cache_class(**cache_config)
+                model_kwargs[cache_name] = QuantizedCache(backend=backend, **cache_config)
             elif generation_config.cache_implementation == "offloaded":
-                model_kwargs[cache_name] = OffloadedCache()
+                model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs, offloading=True)
             elif generation_config.cache_implementation == "dynamic":
                 model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs)

src/transformers/integrations/executorch.py

Lines changed: 6 additions & 10 deletions
@@ -20,7 +20,6 @@
     DynamicLayer,
     DynamicSlidingWindowLayer,
     EncoderDecoderCache,
-    HybridCache,
     StaticCache,
 )
 from ..generation.configuration_utils import GenerationConfig

@@ -38,9 +37,6 @@
 )
 
 
-# Add this to src/transformers/integrations/executorch.py
-
-
 class TorchExportableModuleForVLM:
     """
     A wrapper class for exporting Vision-Language Models (VLMs) like SmolVLM2 for ExecuTorch.

@@ -207,7 +203,7 @@ def __init__(
         model: PreTrainedModel,
     ):
         """
-        Initializes the exportable module with `HybridCache`.
+        Initializes the exportable module.
 
         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.

@@ -636,7 +632,7 @@ def generate(
 class TorchExportableModuleWithHybridCache(torch.nn.Module):
     """
     A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
-    specifically for decoder-only LM to `HybridCache`. This module ensures that the
+    specifically for decoder-only LM to hybrid `StaticCache`. This module ensures that the
     exported model is compatible with further lowering and execution in `ExecuTorch`.
     """

@@ -645,13 +641,13 @@ def __init__(
         model: PreTrainedModel,
     ):
         """
-        Initializes the exportable module with `HybridCache`.
+        Initializes the exportable module.
 
         Args:
             model (`PreTrainedModel`): The pretrained model to wrap.
 
         Raises:
-            AssertionError: If the model doesn't have the expected configuration for HybridCache.
+            AssertionError: If the model doesn't have the expected configuration for an hybrid StaticCache.
         """
         super().__init__()
         self.model = model

@@ -676,8 +672,8 @@ def __init__(
         if not config.use_cache:
             raise AssertionError("Model must have caching enabled.")
 
-        # Initialize the HybridCache
-        self.cache = HybridCache(config=config, max_cache_len=generation_config.cache_config.get("max_cache_len"))
+        # Initialize the cache
+        self.cache = StaticCache(config=config, max_cache_len=generation_config.cache_config.get("max_cache_len"))
         head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
         max_batch_size = generation_config.cache_config.get("batch_size")
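A sketch of the equivalent standalone cache setup, assuming a hybrid-attention config such as Gemma-2 (built here from `Gemma2Config` defaults to stay self-contained) and the `cache_config` keys used by the module above; values are illustrative only.

```python
from transformers import Gemma2Config, GenerationConfig
from transformers.cache_utils import StaticCache

config = Gemma2Config()  # default config of a hybrid (sliding + full attention) model
generation_config = GenerationConfig(cache_config={"batch_size": 1, "max_cache_len": 512})

# One StaticCache; the hybrid/sliding layer layout is read from the config,
# so no dedicated HybridCache class is needed anymore.
cache = StaticCache(config=config, max_cache_len=generation_config.cache_config.get("max_cache_len"))
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
max_batch_size = generation_config.cache_config.get("batch_size")
```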

src/transformers/masking_utils.py

Lines changed: 6 additions & 6 deletions
@@ -736,7 +736,7 @@
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
     """
     Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
-    has an HybridCache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
+    has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
     to what is needed in the `modeling_xxx.py` files).
 
     Args:

@@ -761,7 +761,7 @@
             An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
             useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
     """
-    # If we have an HybridCache structure, here we want to create the mask for the full layers
+    # If we have an hybrid cache structure, here we want to create the mask for the full layers
     if hasattr(past_key_values, "is_sliding") and False in past_key_values.is_sliding:
         layer_idx = past_key_values.is_sliding.index(False)
     else:

@@ -828,7 +828,7 @@
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
     """
     Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
-    of attention pattern was mostly democratized by Mistral. If `past_key_values` has an HybridCache structure, this
+    of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this
     function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
     `modeling_xxx.py` files).

@@ -854,7 +854,7 @@
             An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
             useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
     """
-    # If we have an HybridCache structure, here we want to create the mask for the sliding layers
+    # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
     if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
         layer_idx = past_key_values.is_sliding.index(True)
     else:

@@ -923,7 +923,7 @@
 ) -> Optional[Union[torch.Tensor, BlockMask]]:
     """
     Create a chunked attention causal mask based on the attention implementation used (stored in the config). This type
-    of attention pattern was mostly democratized by Llama4. If `past_key_values` has an HybridCache structure, this
+    of attention pattern was mostly democratized by Llama4. If `past_key_values` has an hybrid cache structure, this
     function will return the mask corresponding to one of the "chunked_attention" layers (to align to what is needed in the
     `modeling_xxx.py` files).

@@ -949,7 +949,7 @@
             An optional mask function to combine with the chunked causal mask function (by doing the intersection of both). This is
             useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
     """
-    # If we have an HybridCache structure, here we want to create the mask for the sliding layers
+    # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
     if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
         layer_idx = past_key_values.is_sliding.index(True)
     else:
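The layer-selection rule these mask builders share can be illustrated with a toy object; `FakeHybridCache` below is purely illustrative and only mimics the per-layer `is_sliding` attribute that the helpers inspect.

```python
class FakeHybridCache:
    """Illustrative stand-in exposing only the per-layer `is_sliding` flags."""

    def __init__(self, is_sliding):
        self.is_sliding = is_sliding


# Layers 0, 1, 3 use sliding attention; layer 2 is full attention.
past_key_values = FakeHybridCache(is_sliding=[True, True, False, True])

# create_causal_mask picks a "full_attention" layer when one exists...
full_idx = past_key_values.is_sliding.index(False) if False in past_key_values.is_sliding else 0

# ...while the sliding/chunked variants pick a sliding layer.
sliding_idx = past_key_values.is_sliding.index(True) if True in past_key_values.is_sliding else 0

print(full_idx, sliding_idx)  # 2 0
```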

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -467,7 +467,7 @@ class BambaMixer(nn.Module):
     and is why Mamba is called **selective** state spaces)
 
     The are a few differences between this and Mamba2Mixer:
-    - The variable use_precomputed_states is slightly different due to the HybridCache structure
+    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
     - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
     - Some extra variables that our layer doesn't need have been removed
     - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged

src/transformers/models/bamba/modular_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ class BambaMixer(nn.Module):
     and is why Mamba is called **selective** state spaces)
 
     The are a few differences between this and Mamba2Mixer:
-    - The variable use_precomputed_states is slightly different due to the HybridCache structure
+    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
     - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
     - Some extra variables that our layer doesn't need have been removed
     - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged

src/transformers/models/gpt_neo/modeling_gpt_neo.py

Lines changed: 1 addition & 1 deletion
@@ -477,7 +477,7 @@ class GPTNeoPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["GPTNeoBlock"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn = True
-    _can_compile_fullgraph = False  # TODO: needs a HybridCache
+    _can_compile_fullgraph = False  # TODO: needs a hybrid cache
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)

src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py

Lines changed: 1 addition & 1 deletion
@@ -394,7 +394,7 @@ class GraniteMoeHybridMambaLayer(nn.Module):
     and is why Mamba is called **selective** state spaces)
 
     The are a few differences between this and Mamba2Mixer:
-    - The variable use_precomputed_states is slightly different due to the HybridCache structure
+    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
     - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
     - Some extra variables that our layer doesn't need have been removed
     - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged

src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py

Lines changed: 6 additions & 10 deletions
@@ -27,7 +27,7 @@
 import torch.nn as nn
 
 from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
+from ...cache_utils import Cache, DynamicCache, StaticCache
 from ...generation import GenerationConfig, GenerationMixin
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available

@@ -945,14 +945,9 @@ def _update_causal_mask(
         # to infer the attention mask.
         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
         using_static_cache = isinstance(past_key_values, StaticCache)
-        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
 
         # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
-        if (
-            self.config._attn_implementation == "sdpa"
-            and not (using_static_cache or using_sliding_window_cache)
-            and not output_attentions
-        ):
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
             if AttentionMaskConverter._ignore_causal_mask_sdpa(
                 attention_mask,
                 inputs_embeds=input_tensor,

@@ -965,8 +960,8 @@
         dtype = input_tensor.dtype
         min_dtype = torch.finfo(dtype).min
         sequence_length = input_tensor.shape[1]
-        # SlidingWindowCache or StaticCache
-        if using_sliding_window_cache or using_static_cache:
+        # StaticCache
+        if using_static_cache:
             target_length = past_key_values.get_max_cache_shape()
         # DynamicCache or no cache
         else:

@@ -1049,7 +1044,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
             # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
             # the check is needed to verify is current checkpoint was trained with sliding window or not
-            if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
+            is_static_sliding_cache = isinstance(past_key_values, StaticCache) and all(past_key_values.is_sliding)
+            if not is_static_sliding_cache or sequence_length > target_length:
                 sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                     cache_position.reshape(-1, 1) - text_config.sliding_window
                 )
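The sliding-window exclusion computed in the last hunk can be checked with toy numbers; the window size and positions below are arbitrary.

```python
import torch

sliding_window = 3
target_length = 8
cache_position = torch.arange(5, 8)  # current query positions 5, 6, 7

# Same arithmetic as above: True marks key positions that fall outside the
# sliding window for each query position and therefore get masked out.
sliding_attend_mask = torch.arange(target_length) <= (cache_position.reshape(-1, 1) - sliding_window)
print(sliding_attend_mask.int())
# tensor([[1, 1, 1, 0, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 1, 0, 0, 0]])
```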
