Commit 243fae8

ai-edge-bot authored and copybara-github committed
internal changes only
PiperOrigin-RevId: 719113596
1 parent ebb8fa6 commit 243fae8

2 files changed: +7 -2 lines changed


ai_edge_torch/generative/layers/kv_cache.py
Lines changed: 2 additions & 1 deletion

@@ -81,7 +81,8 @@ def from_model_config(
     """
     caches = [
         KVCacheEntry.from_model_config(
-            config.kv_cache_max,
+            config.kv_cache_max if not config.block_config(idx).kv_cache_max_len
+            else config.block_config(idx).kv_cache_max_len,
            config.block_config(idx).attn_config,
            dtype,
            device,
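
With this change, each KVCacheEntry receives a per-block cache length: the block-level kv_cache_max_len wins when it is set, and the model-wide config.kv_cache_max is used otherwise. A minimal standalone sketch of that fallback logic (resolve_kv_len and the simplified _Block dataclass below are illustrative helpers, not part of the codebase):

from dataclasses import dataclass
from typing import Optional

@dataclass
class _Block:  # simplified stand-in for TransformerBlockConfig
  kv_cache_max_len: Optional[int] = None

def resolve_kv_len(model_kv_cache_max: int, block: _Block) -> int:
  # Mirrors the diff: a falsy override (None or 0) falls back to the
  # model-wide maximum; a set override wins.
  return (model_kv_cache_max if not block.kv_cache_max_len
          else block.kv_cache_max_len)

assert resolve_kv_len(1024, _Block()) == 1024                     # fallback
assert resolve_kv_len(1024, _Block(kv_cache_max_len=256)) == 256  # override

Note that the `if not ...` test treats both None and 0 as unset, so a block cannot opt into a zero-length cache.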

ai_edge_torch/generative/layers/model_config.py
Lines changed: 5 additions & 1 deletion

@@ -164,6 +164,9 @@ class TransformerBlockConfig:
   parallel_residual: bool = False
   # The Attention computation will include relative positional bias.
   relative_attention: bool = False
+  # KV Cache length for this block. Only used when attention types are different
+  # across blocks
+  kv_cache_max_len: Optional[int] = None


 @dataclasses.dataclass
@@ -200,7 +203,8 @@ class ModelConfig:
   embedding_use_bias: bool = False
   # Image embedding parameters.
   image_embedding: Optional[ImageEmbeddingConfig] = None
-
+  # Number of image tokens
+  num_mm_tokens_per_image: Optional[int] = None
   # Use bias term within LLM's HEAD.
   lm_head_use_bias: bool = False
   # Whether LLM's HEAD shares the weight of the embedding.
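
Together, the two new fields let one model mix attention types across blocks (as the comment on kv_cache_max_len notes), e.g. long-range blocks next to short sliding-window blocks, each with its own KV cache length. A hedged sketch of how such a config might be assembled; BlockConfig and Config are simplified stand-ins for TransformerBlockConfig and ModelConfig, and the 256-token values are illustrative, not from the diff:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class BlockConfig:  # simplified stand-in for TransformerBlockConfig
  kv_cache_max_len: Optional[int] = None

@dataclass
class Config:  # simplified stand-in for ModelConfig
  kv_cache_max: int = 1024
  num_mm_tokens_per_image: Optional[int] = None
  block_configs: List[BlockConfig] = field(default_factory=list)

  def block_config(self, idx: int) -> BlockConfig:
    return self.block_configs[idx]

cfg = Config(
    kv_cache_max=1024,
    num_mm_tokens_per_image=256,  # e.g. one image mapped to 256 tokens
    block_configs=[
        BlockConfig(),                      # falls back to kv_cache_max (1024)
        BlockConfig(kv_cache_max_len=256),  # short-window block: 256
    ],
)
for idx in range(2):
  blk = cfg.block_config(idx)
  print(cfg.kv_cache_max if not blk.kv_cache_max_len
        else blk.kv_cache_max_len)  # -> 1024, then 256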
