Commit f114cce

hheydary authored and copybara-github committed
Support custom batch size in decode signature.
PiperOrigin-RevId: 731316538
1 parent 6003b17 commit f114cce

8 files changed (+58, -34 lines)

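At a glance: the converter scripts gain a --decode_batch_size flag that flows through ExportConfig into the exported decode signature, while prefill stays at batch size 1. A minimal sketch of the intended call, mirroring the diffs below (the checkpoint path, sequence lengths, and batch size are illustrative, and build_model is assumed to mirror build_model_v2's signature):

from ai_edge_torch.generative.examples.smollm import smollm
from ai_edge_torch.generative.utilities import converter
from ai_edge_torch.generative.utilities import model_builder

# Illustrative values; see convert_to_tflite.py below for the real flag wiring.
pytorch_model = smollm.build_model('/path/to/checkpoint', kv_cache_max_len=1024)
converter.convert_to_tflite(
    pytorch_model,
    output_path='/tmp/',
    output_name_prefix='smollm',
    prefill_seq_len=[8, 64, 128],
    quantize=True,
    # Decode signature with batch size 2; the prefill signature keeps batch 1.
    export_config=model_builder.ExportConfig(decode_batch_size=2),
)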

ai_edge_torch/generative/examples/smollm/convert_to_tflite.py

Lines changed: 9 additions & 2 deletions
@@ -22,7 +22,7 @@
 from absl import flags
 from ai_edge_torch.generative.examples.smollm import smollm
 from ai_edge_torch.generative.utilities import converter
-from ai_edge_torch.generative.utilities.model_builder import ExportConfig
+from ai_edge_torch.generative.utilities import model_builder

 _CHECKPOINT_PATH = flags.DEFINE_string(
     'checkpoint_path',
@@ -59,6 +59,11 @@
     None,
     'If set, the model will be converted with the provided list of LoRA ranks.',
 )
+_DECODE_BATCH_SIZE = flags.DEFINE_integer(
+    'decode_batch_size',
+    1,
+    'The batch size for the decode signature.',
+)


 def main(_):
@@ -72,7 +77,9 @@ def main(_):
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
       lora_ranks=_LORA_RANKS.value,
-      export_config=ExportConfig(),
+      export_config=model_builder.ExportConfig(
+          decode_batch_size=_DECODE_BATCH_SIZE.value
+      ),
   )

ai_edge_torch/generative/examples/smollm/convert_v2_to_tflite.py

Lines changed: 25 additions & 8 deletions
@@ -22,17 +22,22 @@
 from absl import flags
 from ai_edge_torch.generative.examples.smollm import smollm
 from ai_edge_torch.generative.utilities import converter
-from ai_edge_torch.generative.utilities.model_builder import ExportConfig
+from ai_edge_torch.generative.utilities import model_builder

 _CHECKPOINT_PATH = flags.DEFINE_string(
     'checkpoint_path',
     os.path.join(pathlib.Path.home(), 'Downloads/llm_data/smollm2'),
     'The path to the model checkpoint, or directory holding the checkpoint.',
 )
-_TFLITE_PATH = flags.DEFINE_string(
-    'tflite_path',
+_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
     '/tmp/',
-    'The tflite file path to export.',
+    'The path to export the tflite model.',
+)
+_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
+    'output_name_prefix',
+    'smollm2',
+    'The prefix of the output tflite model name.',
 )
 _PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
     'prefill_seq_lens',
@@ -49,21 +54,33 @@
     True,
     'Whether the model should be quantized.',
 )
+_LORA_RANKS = flags.DEFINE_multi_integer(
+    'lora_ranks',
+    None,
+    'If set, the model will be converted with the provided list of LoRA ranks.',
+)
+_DECODE_BATCH_SIZE = flags.DEFINE_integer(
+    'decode_batch_size',
+    1,
+    'The batch size for the decode signature.',
+)


 def main(_):
   pytorch_model = smollm.build_model_v2(
       _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
   )

-  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
-  output_filename = f'smollm2_{quant_suffix}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
   converter.convert_to_tflite(
       pytorch_model,
-      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
+      output_path=_OUTPUT_PATH.value,
+      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
       prefill_seq_len=_PREFILL_SEQ_LENS.value,
       quantize=_QUANTIZE.value,
-      export_config=ExportConfig(),
+      lora_ranks=_LORA_RANKS.value,
+      export_config=model_builder.ExportConfig(
+          decode_batch_size=_DECODE_BATCH_SIZE.value
+      ),
   )

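Note the flag rename here: the file name is now assembled inside converter.convert_to_tflite from output_path and output_name_prefix rather than in the script. A small sketch of the resulting path, following the naming logic in converter.py further down (values illustrative):

import os

output_path = '/tmp/'
output_name_prefix = 'smollm2'
quant_suffix = 'q8'  # 'q8' when --quantize is set, 'f32' otherwise
kv_size = 1024       # KV cache size, rendered as the 'ekv' marker
output_filename = f'{output_name_prefix}_{quant_suffix}_ekv{kv_size}.tflite'
print(os.path.join(output_path, output_filename))  # /tmp/smollm2_q8_ekv1024.tflite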
ai_edge_torch/generative/fx_passes/test/test_remove_sdpa_zero_mask_pass.py

Lines changed: 0 additions & 1 deletion
@@ -65,7 +65,6 @@ def __init__(self, config: unet_cfg.AttentionBlock2DConfig):
     super().__init__()
     self.config = config
     self.attention = SelfAttention(
-        config.attention_batch_size,
         config.dim,
         config.attention_config,
         enable_hlfb=config.enable_hlfb,

ai_edge_torch/generative/layers/attention.py

Lines changed: 0 additions & 11 deletions
@@ -48,7 +48,6 @@ def __init__(
         config.pre_attention_norm_config,
     )
     self.atten_func = CausalSelfAttention(
-        model_config.batch_size,
         model_config.embedding_dim,
         config.attn_config,
         model_config.enable_hlfb,
@@ -115,22 +114,19 @@ class CausalSelfAttention(nn.Module):

   def __init__(
       self,
-      batch_size: int,
       dim: int,
       config: cfg.AttentionConfig,
       enable_hlfb: bool,
   ) -> None:
     """Initialize an instance of CausalSelfAttention.

     Args:
-      batch_size (int): batch size of the input tensor.
       dim (int): causal attention's input/output dimmension.
       config (cfg.AttentionConfig): attention specific configurations.
       enable_hlfb (bool): whether hlfb is enabled or not.
     """
     super().__init__()
     self.kv_cache = None
-    self.batch_size = batch_size
     qkv_shape = (
         config.num_heads + 2 * config.num_query_groups
     ) * config.head_dim
@@ -179,11 +175,6 @@ def forward(
     """
     # Batch size, sequence length, embedding dimensionality.
     B, T, E = x.size()
-    assert B == self.batch_size, (
-        "batch size of input tensor must match with the batch size specified in"
-        " the model configuration."
-    )
-
     qkv = self.qkv_projection(x)

     # Assemble into a number of query groups to support MHA, MQA and GQA.
@@ -290,7 +281,6 @@ class CrossAttention(nn.Module):

   def __init__(
       self,
-      batch_size: int,
       query_dim: int,
       cross_dim: int,
       hidden_dim: int,
@@ -301,7 +291,6 @@ def __init__(
     """Initialize an instance of CrossAttention.

     Args:
-      batch_size (int): batch size of the input tensor.
       query_dim (int): query tensor's dimension.
       cross_dim (int): cross attention's dimensions, for key and value tensors.
       hidden_dim (int): hidden dimension that q, k, v tensors project to.
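The upshot of dropping the stored batch_size and the assert: the layer now reads its batch size off the input tensor alone, so one module can back both a batch-1 prefill signature and a batched decode signature. A toy stand-in (not the library class, which adds query groups and KV-cache handling) showing the shape behavior:

import torch
from torch import nn

class ToySelfAttention(nn.Module):
  """Minimal sketch of batch-agnostic self-attention."""

  def __init__(self, dim: int):
    super().__init__()
    self.qkv_projection = nn.Linear(dim, 3 * dim)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    B, T, E = x.size()  # batch size comes from the input, not the config
    q, k, v = self.qkv_projection(x).chunk(3, dim=-1)
    return nn.functional.scaled_dot_product_attention(q, k, v)

attn = ToySelfAttention(dim=64)
for batch in (1, 2, 4):
  # The same module instance serves any batch size without re-construction.
  assert attn(torch.randn(batch, 8, 64)).shape == (batch, 8, 64)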

ai_edge_torch/generative/layers/kv_cache.py

Lines changed: 8 additions & 5 deletions
@@ -18,14 +18,11 @@
 import dataclasses
 from typing import List, Tuple

-from ai_edge_torch import hlfb
 from ai_edge_torch.generative.layers import model_config
 from ai_edge_torch.generative.utilities.dynamic_update_slice import dynamic_update_slice
 import torch
 import torch.utils._pytree as pytree

-BATCH_SIZE = 1
-

 @dataclasses.dataclass
 class KVCacheEntry:
@@ -45,9 +42,10 @@ def from_model_config(
       config: model_config.AttentionConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCacheEntry":
     """Build an instance of the class based on model config."""
-    shape = (BATCH_SIZE, kv_cache_max, config.num_query_groups, config.head_dim)
+    shape = (batch_size, kv_cache_max, config.num_query_groups, config.head_dim)
     k = torch.zeros(shape, dtype=dtype, device=device)
     v = torch.zeros(shape, dtype=dtype, device=device)
     obj = cls(k_cache=k, v_cache=v)
@@ -66,6 +64,7 @@ def from_model_config(
       config: model_config.ModelConfig,
       dtype: torch.dtype = torch.float32,
       device: torch.device = None,
+      batch_size: int = 1,
   ) -> "KVCache":
     """Build an instance of the class based on model config.
@@ -75,17 +74,21 @@
         Defaults to torch.float32.
       device (torch.device, optional): The device placement of the cache
         tensors. Defaults to None.
+      batch_size (int, optional): The batch size of the cache tensors.
+        Defaults to 1.

     Returns:
       KVCache: The created cache object.
     """
     caches = [
         KVCacheEntry.from_model_config(
-            config.kv_cache_max if not config.block_config(idx).kv_cache_max_len
+            config.kv_cache_max
+            if not config.block_config(idx).kv_cache_max_len
            else config.block_config(idx).kv_cache_max_len,
             config.block_config(idx).attn_config,
             dtype,
             device,
+            batch_size,
         )
         for idx in range(config.num_layers)
     ]
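For reference, the entry allocation reduces to a pair of zero tensors whose leading dimension is now the caller-supplied batch size rather than the old module-level BATCH_SIZE constant. A standalone sketch of the same shape logic (dimensions illustrative):

import torch

def make_kv_entry(kv_cache_max: int, num_query_groups: int, head_dim: int,
                  batch_size: int = 1):
  # Mirrors KVCacheEntry.from_model_config: zero-filled K and V buffers.
  shape = (batch_size, kv_cache_max, num_query_groups, head_dim)
  return torch.zeros(shape), torch.zeros(shape)

k, v = make_kv_entry(kv_cache_max=1024, num_query_groups=4, head_dim=64,
                     batch_size=2)
assert k.shape == v.shape == (2, 1024, 4, 64)  # batch now leads the shape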

ai_edge_torch/generative/layers/model_config.py

Lines changed: 0 additions & 3 deletions
@@ -220,9 +220,6 @@ class ModelConfig:
   # The maximum sequence length of the KV cache. Should not exceed max_seq_len.
   kv_cache_max_len: int = 0

-  # Default batch size of the exported model. Default value is 1.
-  batch_size: int = 1
-
   # Softcap on the model output logits.
   final_logit_softcap: Optional[float] = None

ai_edge_torch/generative/utilities/converter.py

Lines changed: 14 additions & 4 deletions
@@ -110,6 +110,11 @@ def convert_to_tflite(
   lora_suffix = (
       '' if not lora_ranks else f'_lora{",".join(map(str, lora_ranks))}'
   )
+
+  if export_config is not None:
+    if export_config.decode_batch_size > 1:
+      output_name_prefix += f'_dbs{export_config.decode_batch_size}'
+
   output_filename = (
       f'{output_name_prefix}_{quant_suffix}_ekv{kv_size}{lora_suffix}.tflite'
   )
@@ -162,9 +167,14 @@ def _export_helper(
   if prefill_masks:
     assert len(prefill_masks) == len(prefill_seq_lens)

-  decode_token = torch.tensor([[0]], dtype=torch.int)
+  decode_token = torch.tensor(
+      [[0] for _ in range(export_config.decode_batch_size)], dtype=torch.int
+  )
   decode_input_pos = torch.tensor([0], dtype=torch.int)
-  kv = export_config.kvcache_cls.from_model_config(config)
+  prefill_kv = export_config.kvcache_cls.from_model_config(config)
+  decode_kv = export_config.kvcache_cls.from_model_config(
+      config, batch_size=export_config.decode_batch_size
+  )

   quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
@@ -183,7 +193,7 @@
     sample_kwargs = {
         'tokens': prefill_tokens,
         'input_pos': prefill_input_pos,
-        'kv_cache': kv,
+        'kv_cache': prefill_kv,
     }
     if prefill_masks is not None:
       sample_kwargs['mask'] = prefill_masks[i]
@@ -211,7 +221,7 @@
     sample_kwargs = {
         'tokens': decode_token,
         'input_pos': decode_input_pos,
-        'kv_cache': kv,
+        'kv_cache': decode_kv,
     }
     if export_config.decode_mask is not None:
       sample_kwargs['mask'] = export_config.decode_mask
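Two details above are worth calling out: the decode sample input grows one row per batch element, and (per the first hunk) models exported with a non-default batch size pick up a _dbs marker in the file name. A quick sketch (batch size illustrative):

import torch

decode_batch_size = 2

# Shape (batch, 1): one zero token per batch element, as in decode_token above.
decode_token = torch.tensor(
    [[0] for _ in range(decode_batch_size)], dtype=torch.int
)
assert decode_token.shape == (decode_batch_size, 1)

# File-name marker added in convert_to_tflite when the batch size is not 1.
output_name_prefix = 'smollm2'
if decode_batch_size > 1:
  output_name_prefix += f'_dbs{decode_batch_size}'
assert output_name_prefix == 'smollm2_dbs2'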

ai_edge_torch/generative/utilities/model_builder.py

Lines changed: 2 additions & 0 deletions
@@ -60,6 +60,8 @@ class ExportConfig:
   decode_mask: Optional[torch.Tensor | List[torch.Tensor]] = None
   # The KV Cache class for K and V buffers in attention.
   kvcache_cls: type = kv_utils.KVCache
+  # The batch size of the decode signature.
+  decode_batch_size: int = 1


 class DecoderOnlyModel(nn.Module):
