
Commit dc11a3c

[core] Refactor the Cache logic to make it simpler and more general (#39797)
* Simplify the logic quite a bit
* Update cache_utils.py
* continue work
* continue simplifying a lot
* style
* Update cache_utils.py
* offloading much simpler
* style
* Update cache_utils.py
* update inits
* Update cache_utils.py
* consistency
* Update cache_utils.py
* update generate
* style
* fix
* fix
* add early_initialization
* fix
* fix mamba caches
* update
* fix
* fix
* fix
* fix tests
* fix configs
* revert
* fix tests
* alright
* Update modeling_gptj.py
* fix the constructors
* cache tests
* Update test_cache_utils.py
* fix
* simplify
* back to before -> avoid compile bug
* doc
* mistral test
* llama4 test dtype
* Update test_modeling_llama4.py
* CIs
* Finally find a nice impl
* Update cache_utils.py
* Update cache_utils.py
* add lazy methods in autodoc
* typo
* better doc
* Add detailed docstring for lazy init
* CIs
* style
* fix
1 parent 95510ab commit dc11a3c

48 files changed: +779 -1449 lines changed

docs/source/en/internal/generation_utils.md

Lines changed: 11 additions & 18 deletions
@@ -363,37 +363,34 @@ A [`Constraint`] can be used to force the generation to include specific tokens
     - get_max_cache_shape
     - reset
     - reorder_cache
+    - lazy_initialization
 
 [[autodoc]] DynamicLayer
     - update
+    - lazy_initialization
     - crop
     - batch_repeat_interleave
     - batch_select_indices
 
 [[autodoc]] StaticLayer
     - update
+    - lazy_initialization
 
 [[autodoc]] SlidingWindowLayer
     - update
+    - lazy_initialization
 
-[[autodoc]] CacheProcessor
-    - pre_update
-    - post_update
-
-[[autodoc]] OffloadedCacheProcessor
-    - pre_update
-
-[[autodoc]] QuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] QuantoQuantizedCacheProcessor
-    - post_update
+[[autodoc]] QuantoQuantizedLayer
+    - update
+    - lazy_initialization
 
-[[autodoc]] HQQQuantizedCacheProcessor
-    - post_update
+[[autodoc]] HQQQuantizedLayer
+    - update
+    - lazy_initialization
 
 [[autodoc]] Cache
     - update
+    - early_initialization
     - get_seq_length
     - get_mask_sizes
     - get_max_cache_shape

@@ -411,12 +408,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 
 [[autodoc]] QuantoQuantizedCache
 
-[[autodoc]] QuantoQuantizedCacheProcessor
-
 [[autodoc]] HQQQuantizedCache
 
-[[autodoc]] HQQQuantizedCacheProcessor
-
 [[autodoc]] OffloadedCache
 
 [[autodoc]] StaticCache
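The reworked autodoc list above reflects the API change in this commit: the `CacheProcessor` family is removed, quantized caching moves into `QuantoQuantizedLayer` / `HQQQuantizedLayer`, each layer documents a `lazy_initialization` method, and `Cache` gains `early_initialization`. As a rough, hedged illustration of the layer-level flow (a sketch, not code from this commit; shapes and values are made up), the snippet below exercises `DynamicCache.update`, whose storage is presumably set up lazily from the first key/value tensors it sees:

```python
# Sketch only: exercises the public Cache API touched by this commit.
# Assumes a transformers build that includes the refactor; shapes are illustrative.
import torch
from transformers import DynamicCache

cache = DynamicCache()

# Fake key/value states for one attention layer: (batch, num_heads, seq_len, head_dim)
key_states = torch.randn(1, 8, 5, 64)
value_states = torch.randn(1, 8, 5, 64)

# `update` stores the new states for the given layer and returns the full keys/values
# to attend over; with a fresh cache this is just the tensors we passed in.
keys, values = cache.update(key_states, value_states, layer_idx=0)

print(keys.shape)              # torch.Size([1, 8, 5, 64])
print(cache.get_seq_length())  # 5
```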

docs/source/en/kv_cache.md

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Init StaticCache with big enough max-length (1024 tokens for the below example)
 # You can also init a DynamicCache, if that suits you better
-prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=model.device.type, dtype=torch.bfloat16)
+prompt_cache = StaticCache(config=model.config, max_cache_len=1024)
 
 INITIAL_PROMPT = "You are a helpful assistant. "
 inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(model.device.type)
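This change is the user-visible core of the refactor: `StaticCache` now only needs the model config and a maximum cache length, with batch size, device and dtype presumably resolved lazily at first use. Below is a hedged sketch of the surrounding kv_cache.md prompt-reuse example rewritten around the new constructor (the model id is illustrative, any causal LM works):

```python
# Sketch under the assumptions above; not copied verbatim from the docs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache

model_id = "meta-llama/Llama-2-7b-chat-hf"  # illustrative choice
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# New-style constructor: no max_batch_size / device / dtype arguments
prompt_cache = StaticCache(config=model.config, max_cache_len=1024)

# Pre-fill the cache with a shared prompt so later requests can reuse it
inputs = tokenizer("You are a helpful assistant. ", return_tensors="pt").to(model.device)
with torch.no_grad():
    prompt_cache = model(**inputs, past_key_values=prompt_cache).past_key_values
```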

docs/source/en/llm_optims.md

Lines changed: 1 addition & 4 deletions
@@ -93,11 +93,8 @@ model.generation_config.max_new_tokens = 16
 
 past_key_values = StaticCache(
     config=model.config,
-    max_batch_size=1,
     # If you plan to reuse the cache, make sure the cache length is large enough for all cases
     max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
-    device=model.device,
-    dtype=model.dtype
 )
 outputs = model.generate(**input_ids, past_key_values=past_key_values)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

@@ -159,7 +156,7 @@ from torch.nn.attention import SDPBackend, sdpa_kernel
 batch_size, seq_length = inputs["input_ids"].shape
 with torch.no_grad():
     past_key_values = StaticCache(
-        config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
+        config=model.config, max_cache_len=4096
     )
     cache_position = torch.arange(seq_length, device=torch_device)
     generated_ids = torch.zeros(

docs/source/en/model_doc/gemma2.md

Lines changed: 1 addition & 2 deletions
@@ -138,8 +138,7 @@ visualizer("You are an assistant. Make sure you print me")
 
 inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
 max_generated_length = inputs.input_ids.shape[1] + 10
-past_key_values = HybridCache(config=model.config, max_batch_size=1,
-                              max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
+past_key_values = HybridCache(config=model.config, max_cache_len=max_generated_length)
 outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
 ```

docs/source/ko/internal/generation_utils.md

Lines changed: 4 additions & 18 deletions
@@ -362,21 +362,11 @@ generation_output[:2]
 [[autodoc]] SlidingWindowLayer
     - update
 
-[[autodoc]] CacheProcessor
-    - pre_update
-    - post_update
-
-[[autodoc]] OffloadedCacheProcessor
-    - pre_update
-
-[[autodoc]] QuantizedCacheProcessor
-    - post_update
-
-[[autodoc]] QuantoQuantizedCacheProcessor
-    - post_update
+[[autodoc]] QuantoQuantizedLayer
+    - update
 
-[[autodoc]] HQQQuantizedCacheProcessor
-    - post_update
+[[autodoc]] HQQQuantizedLayer
+    - update
 
 [[autodoc]] Cache
     - update

@@ -397,12 +387,8 @@ generation_output[:2]
 
 [[autodoc]] QuantoQuantizedCache
 
-[[autodoc]] QuantoQuantizedCacheProcessor
-
 [[autodoc]] HQQQuantizedCache
 
-[[autodoc]] HQQQuantizedCacheProcessor
-
 [[autodoc]] OffloadedCache
 
 [[autodoc]] StaticCache

docs/source/ko/llm_optims.md

Lines changed: 1 addition & 4 deletions
@@ -99,11 +99,8 @@ model.generation_config.max_new_tokens = 16
 
 past_key_values = StaticCache(
     config=model.config,
-    max_batch_size=1,
     # 캐시를 재사용할 계획이 있는 경우, 모든 경우에 충분한 캐시 길이를 설정해야 합니다
     max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
-    device=model.device,
-    dtype=model.dtype
 )
 outputs = model.generate(**input_ids, past_key_values=past_key_values)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

@@ -161,7 +158,7 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
 batch_size, seq_length = inputs["input_ids"].shape
 with torch.no_grad():
     past_key_values = StaticCache(
-        config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
+        config=model.config, max_cache_len=4096
     )
     cache_position = torch.arange(seq_length, device=torch_device)
     generated_ids = torch.zeros(

src/transformers/__init__.py

Lines changed: 8 additions & 7 deletions
@@ -377,23 +377,18 @@
         "StaticLayer",
         "SlidingWindowLayer",
         "ChunkedSlidingLayer",
-        "CacheProcessor",
-        "OffloadedCacheProcessor",
-        "QuantizedCacheProcessor",
-        "QuantoQuantizedCacheProcessor",
-        "HQQQuantizedCacheProcessor",
+        "QuantoQuantizedLayer",
+        "HQQQuantizedLayer",
         "Cache",
         "CacheConfig",
         "DynamicCache",
         "EncoderDecoderCache",
         "HQQQuantizedCache",
-        "HQQQuantizedCacheProcessor",
         "HybridCache",
         "HybridChunkedCache",
         "OffloadedCache",
         "OffloadedStaticCache",
         "QuantizedCache",
-        "QuantoQuantizedCacheProcessor",
         "QuantizedCacheConfig",
         "QuantoQuantizedCache",
         "SinkCache",

@@ -586,19 +581,25 @@
     # All modeling imports
     from .cache_utils import Cache as Cache
     from .cache_utils import CacheConfig as CacheConfig
+    from .cache_utils import ChunkedSlidingLayer as ChunkedSlidingLayer
     from .cache_utils import DynamicCache as DynamicCache
+    from .cache_utils import DynamicLayer as DynamicLayer
     from .cache_utils import EncoderDecoderCache as EncoderDecoderCache
     from .cache_utils import HQQQuantizedCache as HQQQuantizedCache
+    from .cache_utils import HQQQuantizedLayer as HQQQuantizedLayer
     from .cache_utils import HybridCache as HybridCache
     from .cache_utils import MambaCache as MambaCache
     from .cache_utils import OffloadedCache as OffloadedCache
     from .cache_utils import OffloadedStaticCache as OffloadedStaticCache
     from .cache_utils import QuantizedCache as QuantizedCache
     from .cache_utils import QuantizedCacheConfig as QuantizedCacheConfig
     from .cache_utils import QuantoQuantizedCache as QuantoQuantizedCache
+    from .cache_utils import QuantoQuantizedLayer as QuantoQuantizedLayer
     from .cache_utils import SinkCache as SinkCache
     from .cache_utils import SlidingWindowCache as SlidingWindowCache
+    from .cache_utils import SlidingWindowLayer as SlidingWindowLayer
     from .cache_utils import StaticCache as StaticCache
+    from .cache_utils import StaticLayer as StaticLayer
     from .configuration_utils import PretrainedConfig as PretrainedConfig
     from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS as SLOW_TO_FAST_CONVERTERS
     from .convert_slow_tokenizer import convert_slow_tokenizer as convert_slow_tokenizer
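Since the refactor exports the layer classes from the top-level package (and drops the `*CacheProcessor` names), a quick import smoke test might look like the sketch below. It assumes a transformers build containing this commit; actually instantiating the quantized layers would additionally need the quanto / hqq backends installed.

```python
# Sketch: verify the new public names resolve through the lazy top-level module.
from transformers import (
    ChunkedSlidingLayer,
    DynamicLayer,
    HQQQuantizedLayer,
    QuantoQuantizedLayer,
    SlidingWindowLayer,
    StaticLayer,
)

print(DynamicLayer, StaticLayer)  # plain classes; importing them needs no extra dependencies
```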
