
Commit 883075e

hheydary authored and copybara-github committed

Internal code change.

PiperOrigin-RevId: 731863023

1 parent 6d58005 commit 883075e

2 files changed: +45 −39 lines changed


ai_edge_torch/generative/layers/experimental/attention.py

Lines changed: 0 additions & 8 deletions

@@ -52,7 +52,6 @@ def __init__(
         config.pre_attention_norm_config,
     )
     self.atten_func = CausalSelfAttention(
-        model_config.batch_size,
         model_config.embedding_dim,
         config.attn_config,
         model_config.enable_hlfb,
@@ -119,22 +118,19 @@ class CausalSelfAttention(nn.Module):
 
   def __init__(
       self,
-      batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.
 
     Args:
-      batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
     self.kv_cache = None
-    self.batch_size = batch_size
     qkv_shape = (
         config.num_heads + 2 * config.num_query_groups
     ) * config.head_dim
@@ -180,10 +176,6 @@ def forward(
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
-    assert B == self.batch_size, (
-        "batch size of input tensor must match with the batch size specified in"
-        " the model configuration."
-    )
 
     qkv = self.qkv_projection(x)
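With the batch_size constructor argument gone, CausalSelfAttention no longer carries a configured batch size to validate against; the batch dimension is simply read off the input tensor in forward(). A minimal runnable sketch of that idea (the shapes are illustrative, not taken from a real model config):

import torch

# The batch dimension is recovered from the activation itself at forward
# time, so a constructor-time batch_size (and the deleted assert) is
# redundant. Shapes below are made up for illustration.
x = torch.randn(2, 16, 128)  # (batch, sequence length, embedding dim)
B, T, E = x.size()
assert (B, T, E) == (2, 16, 128)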
ai_edge_torch/generative/layers/experimental/kv_cache.py

Lines changed: 45 additions & 31 deletions

@@ -21,23 +21,19 @@
 import dataclasses
 from typing import List, Tuple
 
-from ai_edge_torch import hlfb
 from ai_edge_torch.generative.layers import model_config
-from ai_edge_torch.generative.layers.experimental import types as types
-from ai_edge_torch.generative.utilities.dynamic_update_slice import dynamic_update_slice
+from ai_edge_torch.generative.layers.experimental import types
+from ai_edge_torch.generative.utilities import dynamic_update_slice as dus_utils
 import torch
-import torch.nn as nn
 import torch.utils._pytree as pytree
 
-BATCH_SIZE = 1
-
 
 @dataclasses.dataclass
 class KVCacheEntryBase:
   """A single cache entry that includes K and V caches.
 
   The chaches are built based on the provided config with the shape of
-  (batch_size=1, kv_cache_max, num_query_groups, head_dim).
+  (batch_size, kv_cache_max, num_query_groups, head_dim).
   """
 
   k_cache: torch.Tensor
@@ -46,10 +42,8 @@ class KVCacheEntryBase:
   @classmethod
   def _from_model_config(
       cls,
-      kv_cache_max: int,
-      config: model_config.AttentionConfig,
-      k_shape: Tuple,
-      v_shape: Tuple,
+      k_shape: Tuple[int, ...],
+      v_shape: Tuple[int, ...],
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
   ) -> "KVCacheEntryBase":
@@ -66,12 +60,11 @@ def from_model_config(
       config: model_config.AttentionConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheEntryBase":
     """Build an instance of the class based on model config."""
-    shape = (BATCH_SIZE, kv_cache_max, config.num_query_groups, config.head_dim)
-    return cls._from_model_config(
-        kv_cache_max, config, shape, shape, dtype, device
-    )
+    shape = (batch_size, kv_cache_max, config.num_query_groups, config.head_dim)
+    return cls._from_model_config(shape, shape, dtype, device)
 
 
 @dataclasses.dataclass
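KVCacheEntryBase.from_model_config now takes the batch size as a parameter instead of the deleted module-level BATCH_SIZE constant, and the private _from_model_config receives only the precomputed shapes. A hedged sketch of the resulting cache-entry allocation; the config numbers (kv_cache_max, num_query_groups, head_dim) are illustrative assumptions, and the zero initializer is a stand-in since the allocation itself is not shown in this hunk:

import torch

# Illustrative values standing in for model_config fields.
batch_size, kv_cache_max, num_query_groups, head_dim = 1, 1024, 4, 64
shape = (batch_size, kv_cache_max, num_query_groups, head_dim)
k_cache = torch.zeros(shape, dtype=torch.float32)  # stand-in allocation
v_cache = torch.zeros(shape, dtype=torch.float32)
assert k_cache.shape == (1, 1024, 4, 64)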
@@ -93,24 +86,22 @@ def from_model_config(
       config: model_config.AttentionConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheEntryBase":
     """Build an instance of the class based on model config."""
-    num_kv_heads = config.num_query_groups
     k_shape = (
-        1,
-        BATCH_SIZE * num_kv_heads,
+        batch_size,
+        config.num_query_groups,
         kv_cache_max,
         config.head_dim,
-    )  # 1, bk, s, h
+    )  # b, k, s, h
     v_shape = (
-        1,
-        BATCH_SIZE * num_kv_heads,
+        batch_size,
+        config.num_query_groups,
         config.head_dim,
         kv_cache_max,
-    )  # 1, bk, h, s
-    return cls._from_model_config(
-        kv_cache_max, config, k_shape, v_shape, dtype, device
-    )
+    )  # b, k, h, s
+    return cls._from_model_config(k_shape, v_shape, dtype, device)
 
 
 @dataclasses.dataclass
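For the transposed entry, K and V now share an explicit (batch_size, num_query_groups) prefix, replacing the old fused 1 × (BATCH_SIZE * num_kv_heads) leading dimensions; the two caches differ only in the order of their last two axes. With the same illustrative numbers as above:

# b, k, s, h versus b, k, h, s -- last two dims swapped between K and V.
batch_size, kv_cache_max, num_query_groups, head_dim = 1, 1024, 4, 64
k_shape = (batch_size, num_query_groups, kv_cache_max, head_dim)
v_shape = (batch_size, num_query_groups, head_dim, kv_cache_max)
assert k_shape[:2] == v_shape[:2]
assert k_shape[2:] == tuple(reversed(v_shape[2:]))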
@@ -126,13 +117,15 @@ def _from_model_config(
       config: model_config.ModelConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheBase":
     caches = [
         kv_entry_cls.from_model_config(
            config.kv_cache_max,
            config.block_config(idx).attn_config,
            dtype,
            device,
+           batch_size,
        )
        for idx in range(config.num_layers)
    ]
@@ -145,6 +138,7 @@ def from_model_config(
       config: model_config.ModelConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheBase":
     """Build an instance of the class based on model config.
 
@@ -154,12 +148,19 @@ def from_model_config(
         Defaults to torch.float32.
       device (torch.device, optional): The device placement of the cache
         tensors. Defaults to None.
+      batch_size (int, optional): The batch size of the cache tensors.
+        Defaults to 1.
 
     Returns:
       KVCacheBase: The created cache object.
     """
+    assert batch_size == 1, "Batch size must be 1 for KV Cache."
     return cls._from_model_config(
-        KVCacheEntryBase, config=config, dtype=dtype, device=device
+        KVCacheEntryBase,
+        config=config,
+        dtype=dtype,
+        device=device,
+        batch_size=batch_size,
     )
 
   def flatten(self) -> List[torch.Tensor]:
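Even though batch_size is now threaded through the whole stack, the public KVCacheBase entry point still pins it to 1 with the assert above, so the parameter is forward-looking plumbing rather than working multi-batch support. A hypothetical usage sketch (`config` is a stand-in, not a real model_config.ModelConfig instance):

# Hypothetical usage against the API shown in this diff.
cache = KVCacheBase.from_model_config(config, batch_size=1)  # fine
cache = KVCacheBase.from_model_config(config, batch_size=2)  # raises
# AssertionError: Batch size must be 1 for KV Cache.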
@@ -177,9 +178,14 @@ def from_model_config(
       config: model_config.ModelConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheBTNH":
     return cls._from_model_config(
-        KVCacheEntryBTNH, config=config, dtype=dtype, device=device
+        KVCacheEntryBTNH,
+        config=config,
+        dtype=dtype,
+        device=device,
+        batch_size=batch_size,
     )
 
 
@@ -192,9 +198,14 @@ def from_model_config(
       config: model_config.ModelConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheBTNH":
     return cls._from_model_config(
-        KVCacheEntryTransposed, config=config, dtype=dtype, device=device
+        KVCacheEntryTransposed,
+        config=config,
+        dtype=dtype,
+        device=device,
+        batch_size=batch_size,
     )
 
 
@@ -258,7 +269,6 @@ def update(
     input_pos: torch.Tensor,
     k_slice: torch.Tensor,
     v_slice: torch.Tensor,
-    use_dus: bool = True,
 ) -> KVCacheEntryBase:
   """Out of place update of Cache buffer.
 
@@ -309,6 +319,10 @@ def _update_kv_impl(
   positions = input_pos.clone()
   k_slice_indices = _get_slice_indices(positions, cache_dim, k_ts_idx)
   v_slice_indices = _get_slice_indices(positions, cache_dim, v_ts_idx)
-  k = dynamic_update_slice(cache.k_cache, k_slice, [x for x in k_slice_indices])
-  v = dynamic_update_slice(cache.v_cache, v_slice, [x for x in v_slice_indices])
+  k = dus_utils.dynamic_update_slice(
+      cache.k_cache, k_slice, [x for x in k_slice_indices]
+  )
+  v = dus_utils.dynamic_update_slice(
+      cache.v_cache, v_slice, [x for x in v_slice_indices]
+  )
   return KVCacheEntryTransposed(k, v)
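The cache update stays out of place: dus_utils.dynamic_update_slice returns fresh tensors rather than mutating the cache, in the spirit of XLA's DynamicUpdateSlice. A minimal pure-PyTorch analogue of that semantics, for orientation only (dynamic_update_slice_ref is a hypothetical helper, not the library function):

import torch

def dynamic_update_slice_ref(target, update, start_indices):
  # Out of place: clone first, so the caller's cache tensor is untouched.
  result = target.clone()
  idx = tuple(
      slice(int(i), int(i) + size)
      for i, size in zip(start_indices, update.shape)
  )
  result[idx] = update
  return result

# Write a one-position KV slice into a (b, k, h, s) = (1, 4, 2, 8) cache
# at sequence position 3.
cache = torch.zeros(1, 4, 2, 8)
k_slice = torch.ones(1, 4, 2, 1)
starts = [torch.tensor(0), torch.tensor(0), torch.tensor(0), torch.tensor(3)]
updated = dynamic_update_slice_ref(cache, k_slice, starts)
assert updated[0, 0, 0, 3] == 1
assert cache.sum() == 0  # original cache unchanged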
