
Commit 7fcec7e

haozha111 authored and copybara-github committed
* In ModelConfig, allow the user to specify the negative-infinity mask value, overriding the default float("-inf"), since certain accelerators cannot handle inf values well.
* Also updated the mask computation logic in ExportConfig.

PiperOrigin-RevId: 754207101
1 parent e69c0f8 · commit 7fcec7e

6 files changed: +47 additions, -37 deletions
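As context for the change: per the commit message, certain accelerators do not handle inf values well, so a large finite negative number can stand in for float("-inf") as the "masked" logit. A minimal sketch (not part of the commit; values are illustrative) of why a finite substitute still masks correctly under softmax:

import torch

# The most negative representable float32 is one common finite substitute.
finite_mask_value = torch.finfo(torch.float32).min

scores = torch.randn(1, 4, 4)
causal = torch.tril(torch.ones(4, 4, dtype=torch.bool))

# Additive masking: disallowed positions get the finite value, and softmax
# drives their attention weights to (numerically) zero, just as -inf would.
masked = scores.masked_fill(~causal, finite_mask_value)
probs = torch.softmax(masked, dim=-1)
assert probs[0, 0, 1:].sum() == 0  # row 0 attends only to position 0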

ai_edge_torch/generative/examples/gemma3/decoder.py

Lines changed: 6 additions & 2 deletions
@@ -199,7 +199,11 @@ def create_sliding_mask(
     sliding_mask = torch.where(
         sliding_mask_bool,
         torch.zeros_like(sliding_mask_bool, dtype=torch.float),
-        torch.full_like(sliding_mask_bool, float("-inf"), dtype=torch.float),
+        torch.full_like(
+            sliding_mask_bool,
+            self.config.get_causal_mask_value(),
+            dtype=torch.float,
+        ),
     )

     return sliding_mask
@@ -215,7 +219,7 @@ def compose_mask(
       mask = torch.logical_and(mask, pixel_mask)
     else:
       mask = torch.logical_or(mask, pixel_mask)
-    mask = torch.where(mask, 0, float("-inf"))
+    mask = torch.where(mask, 0, self.config.get_causal_mask_value())
     return mask

   def build_pixel_mask(self, image_indices: torch.Tensor):
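For readers skimming the hunks above: torch.where converts a boolean "may attend" mask into an additive float mask, 0.0 where attention is allowed and the configured value elsewhere. A standalone sketch of that pattern (mine, not library code; mask_value stands in for self.config.get_causal_mask_value()):

import torch

def to_additive_mask(allowed: torch.Tensor, mask_value: float) -> torch.Tensor:
  # allowed: boolean tensor, True where a query may attend to a key.
  return torch.where(
      allowed,
      torch.zeros_like(allowed, dtype=torch.float),
      torch.full_like(allowed, mask_value, dtype=torch.float),
  )

allowed = torch.tensor([[True, False], [True, True]])
print(to_additive_mask(allowed, -1e9))  # 0.0 where True, -1e9 where False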

ai_edge_torch/generative/examples/paligemma/decoder.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def forward(
     if mask is None:
       embeds_len = input_embeds.shape[1]
       mask = torch.zeros(embeds_len, self.config.kv_cache_max)
-      mask[:, embeds_len:] = float("-inf")
+      mask[:, embeds_len:] = attn_config.causal_mask_value

     return self._forward_with_embeds(
         input_embeds,

ai_edge_torch/generative/examples/paligemma/decoder2.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def forward(
     # By default, don't mask image embeds with a diagonal causal mask.
     embeds_len = input_embeds.shape[1]
     mask = torch.zeros(embeds_len, self.config.kv_cache_max)
-    mask[:, embeds_len:] = float("-inf")
+    mask[:, embeds_len:] = attn_config.causal_mask_value

     return self._forward_with_embeds(
         input_embeds, rope, mask, input_pos, kv_cache, export_config
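Both PaliGemma decoders build the same default prefill mask: the first embeds_len KV slots are fully visible to every query row, and everything past them is masked. A small sketch with assumed sizes (embeds_len=3, kv_cache_max=6) and the default mask value:

import torch

embeds_len, kv_cache_max = 3, 6   # assumed example sizes
mask_value = float("-inf")        # or attn_config.causal_mask_value

mask = torch.zeros(embeds_len, kv_cache_max)
mask[:, embeds_len:] = mask_value
# Each of the 3 query rows sees KV slots 0..2 (0.0) and is blocked from 3..5.
print(mask)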

ai_edge_torch/generative/layers/model_config.py

Lines changed: 5 additions & 0 deletions
@@ -116,6 +116,8 @@ class AttentionConfig:
   attn_type: Optional[AttentionType] = None
   # The size of the sliding window used for local attention.
   sliding_window_size: Optional[int] = None
+  # The default causal mask value used by the attention layer.
+  causal_mask_value: float = float("-inf")


 @dataclasses.dataclass
@@ -247,3 +249,6 @@ def block_config(self, idx: int) -> TransformerBlockConfig:
         f"Index {idx} is out of range for layer configs: {self.block_configs}"
     )
     return self.block_configs[idx]
+
+  def get_causal_mask_value(self) -> float:
+    return self.block_config(0).attn_config.causal_mask_value
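A self-contained sketch of the config plumbing this file adds; the dataclass skeletons below are pared down to the new field (the real AttentionConfig and ModelConfig carry many more fields), and the -1e4 override is an illustrative assumption:

import dataclasses
from typing import List

@dataclasses.dataclass
class AttentionConfig:
  # Mirrors the new field: the additive mask value for disallowed positions.
  causal_mask_value: float = float("-inf")

@dataclasses.dataclass
class TransformerBlockConfig:
  attn_config: AttentionConfig

@dataclasses.dataclass
class ModelConfig:
  block_configs: List[TransformerBlockConfig]

  def block_config(self, idx: int) -> TransformerBlockConfig:
    return self.block_configs[idx]

  def get_causal_mask_value(self) -> float:
    # Mirrors the new accessor: read the value from the first block's config.
    return self.block_config(0).attn_config.causal_mask_value

# A model that opts into a finite mask value for accelerator friendliness.
config = ModelConfig(
    block_configs=[TransformerBlockConfig(AttentionConfig(causal_mask_value=-1e4))]
)
assert config.get_causal_mask_value() == -1e4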

ai_edge_torch/generative/utilities/converter.py

Lines changed: 32 additions & 10 deletions
@@ -95,6 +95,18 @@ def define_conversion_flags(model_name: str):
   return flags


+def _build_mask(mask_len, kv_cache_max_len, causal_mask_value) -> torch.Tensor:
+  if isinstance(mask_len, list):
+    return [
+        _build_mask(i, kv_cache_max_len, causal_mask_value) for i in mask_len
+    ]
+
+  mask = torch.full(
+      (mask_len, kv_cache_max_len), causal_mask_value, dtype=torch.float32
+  )
+  return torch.triu(mask, diagonal=1).unsqueeze(0).unsqueeze(0)
+
+
 def convert_to_tflite(
     pytorch_model: torch.nn.Module,
     output_path: str,
@@ -229,14 +241,15 @@ def _export_helper(
       torch.arange(0, seq_len + pixel_seq_len, dtype=torch.int)
   )

-  if export_config.prefill_mask is None:
-    prefill_masks = None
-  elif isinstance(export_config.prefill_mask, torch.Tensor):
-    prefill_masks = [export_config.prefill_mask]
-  elif isinstance(export_config.prefill_mask, list):
-    prefill_masks = export_config.prefill_mask
-  else:
-    raise ValueError('Prefill masks unrecognized.')
+  prefill_masks = None
+  if flags.FLAGS.mask_as_input:
+    prefill_masks = [
+        _build_mask(
+            flags.FLAGS.prefill_seq_lens,
+            flags.FLAGS.kv_cache_max_len,
+            config.get_causal_mask_value(),
+        )
+    ]

   if prefill_masks:
     assert len(prefill_masks) == len(prefill_seq_lens)
@@ -299,8 +312,17 @@ def _export_helper(
       'input_pos': decode_input_pos,
       'kv_cache': decode_kv,
   }
-  if export_config.decode_mask is not None:
-    sample_kwargs['mask'] = export_config.decode_mask
+  if flags.FLAGS.mask_as_input:
+    # Note that the decode mask is not a correct causal mask, but that is okay
+    # for conversion purposes because only the shape matters in conversion.
+    # A correct decode causal mask for a given token position would be built
+    # like:
+    #
+    #   torch.triu(mask, diagonal=decode_position).unsqueeze(0).unsqueeze(0)
+    #
+    sample_kwargs['mask'] = _build_mask(
+        1, flags.FLAGS.kv_cache_max_len, config.get_causal_mask_value()
+    )
   if lora is not None:
     sample_kwargs['lora'] = lora
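A usage sketch for the _build_mask helper added above (assuming it is in scope; the sizes and the -1e4 value are assumed): for a single length it returns a (1, 1, mask_len, kv_cache_max_len) additive causal mask, and for a list of prefill lengths it returns one mask per length:

import torch

m = _build_mask(4, 8, -1e4)
print(m.shape)  # torch.Size([1, 1, 4, 8])
# Row i is 0.0 through column i and -1e4 strictly above the diagonal, so
# query position i may attend to KV slots 0..i.

ms = _build_mask([4, 16], 32, -1e4)
print([t.shape for t in ms])
# [torch.Size([1, 1, 4, 32]), torch.Size([1, 1, 16, 32])]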

ai_edge_torch/generative/utilities/export_config.py

Lines changed: 2 additions & 23 deletions
@@ -33,6 +33,8 @@ class ExportConfig:
   # When False, only decode signatures will produce output.
   output_logits_on_prefill: bool = False
   # Attention masks given as inputs to the model.
+  # Note that `prefill_mask`, `decode_mask`, and `kvcache_cls` are deprecated
+  # and will be removed in a future version.
   prefill_mask: Optional[torch.Tensor | List[torch.Tensor]] = None
   decode_mask: Optional[torch.Tensor | List[torch.Tensor]] = None
   # The KV Cache layout for K and V buffers in attention.
@@ -43,33 +45,10 @@
   decode_batch_size: int = 1


-def _build_mask(mask_len, kv_cache_max_len) -> torch.Tensor:
-  if isinstance(mask_len, list):
-    return [_build_mask(i, kv_cache_max_len) for i in mask_len]
-
-  mask = torch.full(
-      (mask_len, kv_cache_max_len), float('-inf'), dtype=torch.float32
-  )
-  return torch.triu(mask, diagonal=1).unsqueeze(0).unsqueeze(0)
-
-
 def get_from_flags() -> ExportConfig:
   """Builds an export config according to the commandline flags."""
   export_config = ExportConfig()

-  if flags.FLAGS.mask_as_input:
-    export_config.prefill_mask = _build_mask(
-        flags.FLAGS.prefill_seq_lens, flags.FLAGS.kv_cache_max_len
-    )
-    # Note that the decode mask is not a correct causal mask, but it is okay
-    # for the conversion purpose because only the shape matters in conversion.
-    # A correct causal mask of decode for a given token position of decode, it
-    # should be built like:
-    #
-    #   torch.triu(mask, diagonal=decode_position).unsqueeze(0).unsqueeze(0)
-    #
-    export_config.decode_mask = _build_mask(1, flags.FLAGS.kv_cache_max_len)
-
   if flags.FLAGS.transpose_kv_cache:
     export_config.kvcache_layout = kv_utils.KV_LAYOUT_TRANSPOSED
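The comment retained in converter.py notes that the exported decode mask is shape-only; a sketch (with assumed sizes and position) of the "correct" per-position decode mask that the comment's formula describes:

import torch

kv_cache_max_len = 8
decode_position = 5                # assumed current decode position
mask_value = float("-inf")         # or config.get_causal_mask_value()

mask = torch.full((1, kv_cache_max_len), mask_value, dtype=torch.float32)
mask = torch.triu(mask, diagonal=decode_position).unsqueeze(0).unsqueeze(0)
# KV slots before decode_position become 0.0 (visible); the rest keep
# mask_value, matching the formula in the comment above.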
