generic attn mask op

bmarimuthu-nv · bmarimuthu-nv · commit df120f0df818 · 2025-12-23T14:34:46.000-08:00
Signed-off-by: Balamurugan Marimuthu <246387390+bmarimuthu-nv@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
@@ -224,6 +224,7 @@ def flashinfer_mha_with_cache(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    custom_mask: Optional[torch.Tensor],
     # STANDARD METADATA
     batch_info: torch.Tensor,
     cu_seqlen: torch.Tensor,
@@ -244,10 +245,6 @@ def flashinfer_mha_with_cache(
     v_scale: float,
     window_left: int,  # FlashInfer inclusive sliding window (use -1 to disable)
     logits_soft_cap: float,  # FlashInfer logits softcap (use 0.0 to disable)
-    # VLM CUSTOM MASK (optional, for Gemma3 etc.)
-    # Contains bidirectional attention for image tokens. Sliding window is
-    # handled separately by the window_left parameter.
-    custom_mask: Optional[torch.Tensor],
 ) -> torch.Tensor:
     # reshape to standard [b*s, n_heads, head_dim] layout
     head_dim = k_cache.shape[-1]
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/vlm_mask_ops.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/vlm_mask_ops.py
@@ -57,8 +57,10 @@ def create_attention_mask(
     # Dispatch to model-specific generator
     generator = VlmMaskGeneratorRegistry.get(model_type)
     if generator is None:
-        # No model-specific generator - return empty mask (no custom masking)
-        return torch.empty(0, dtype=torch.bool, device=token_info.device)
+        raise ValueError(
+            f"No model-specific generator found for model type: {model_type}. "
+            f"Registered model types: {VlmMaskGeneratorRegistry.registered_model_types()}."
+        )
 
     return generator(token_info, qo_indptr, seq_len, sliding_window)
 
@@ -163,7 +165,7 @@ def _gemma3_mask_impl(
     return torch.cat(masks).contiguous()
 
 
-@VlmMaskGeneratorRegistry.register("gemma3")
+@VlmMaskGeneratorRegistry.register("gemma3_text")
 def generate_gemma3_vlm_mask(
     image_token_mask: Tensor,
     qo_indptr: Tensor,
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/vlm_mask_registry.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/vlm_mask_registry.py
@@ -128,3 +128,8 @@ def has(cls, model_type: str) -> bool:
             True if a generator is registered, False otherwise.
         """
         return model_type in cls._registry
+
+    @classmethod
+    def registered_model_types(cls) -> list:
+        """Return a list of all registered model types."""
+        return list(cls._registry.keys())
diff --git a/tensorrt_llm/_torch/auto_deploy/export/export.py b/tensorrt_llm/_torch/auto_deploy/export/export.py
@@ -227,6 +227,7 @@ def run_forward_for_capture(
     *,
     patch_configs: Optional[Dict[str, Union[dict, Any]]] = None,
     patch_list: Optional[List[str]] = None,
+    post_export_callback: Optional[Callable[[nn.Module], None]] = None,
 ) -> nn.Module:
     """A wrapper to run the provided closure over the model on the meta device with patches.
 
@@ -244,6 +245,8 @@ def run_forward_for_capture(
         patch_configs: Optional patch configurations. If None, all registered patches
                       will be applied with default settings.
         patch_list: Optional list of patch names to apply with default settings.
+        post_export_callback: Optional callback called after capture but before patches are reverted.
+                             Receives the captured module as argument.
     """
     # run capture with patches and lifted to meta
     with apply_export_patches(patch_configs, patch_list), lift_to_meta(model) as state_dict:
@@ -259,6 +262,10 @@ def run_forward_for_capture(
             else:
                 mod_after_capture = capture_fn(model, args, kwargs)
 
+        # Call post_export_callback while patches are still active
+        if post_export_callback is not None:
+            post_export_callback(mod_after_capture)
+
         # load state_dict into egm
         # NOTE: export might have removed unused params/buffers (hence we allow unexpected keys)
         if mod_after_capture is not model:
@@ -283,6 +290,7 @@ def torch_export_to_gm(
     strict: bool = False,
     patch_configs: Optional[Dict[str, Union[dict, Any]]] = None,
     patch_list: Optional[List[str]] = None,
+    post_export_callback: Optional[Callable[[nn.Module], None]] = None,
 ) -> fx.GraphModule:
     """torch's export with wrapping into GraphModule + useful additions to the resulting module.
 
@@ -306,6 +314,8 @@ def torch_export_to_gm(
                       will be applied with default settings.
         patch_list: Optional list of patch names to apply with default settings.
                    Cannot be used together with patch_configs.
+        post_export_callback: Optional callback called after export but before patches are reverted.
+                             Receives the exported GraphModule as argument.
     """
 
     def _capture_fn(model, args, kwargs):
@@ -316,7 +326,14 @@ def _capture_fn(model, args, kwargs):
 
     # run capture with export
     egm = run_forward_for_capture(
-        model, _capture_fn, args, kwargs, clone, patch_list=patch_list, patch_configs=patch_configs
+        model,
+        _capture_fn,
+        args,
+        kwargs,
+        clone,
+        patch_list=patch_list,
+        patch_configs=patch_configs,
+        post_export_callback=post_export_callback,
     )
 
     # Export strips away all methods not traced during forward. The model could have
diff --git a/tensorrt_llm/_torch/auto_deploy/models/factory.py b/tensorrt_llm/_torch/auto_deploy/models/factory.py
@@ -67,9 +67,24 @@ def _init_dynamic_shape_lookup(self) -> Dict[str, DynamicShape]:
         """Initialize the lookup for the dynamic shapes of keyword arguments."""
         raise NotImplementedError("Subclasses must implement this method.")
 
+    def post_export(self, sub_mod: nn.Module, sub_gm: GraphModule):
+        """Called after export but BEFORE patches are reverted.
+
+        Args:
+            sub_mod: The submodule from which the graph was captured+exported.
+            sub_gm: The graph module that was exported.
+
+        This method is called while export patches are still active, allowing access to
+        patch-set metadata on the module (e.g., _vlm_input_names). Override this method
+        to set metadata on the GraphModule that depends on patch state.
+
+        Default implementation does nothing.
+        """
+        pass
+
     @abstractmethod
     def post_process(self, sub_mod: nn.Module, sub_gm: GraphModule):
-        """Post-process the subgraph module.
+        """Post-process the subgraph module AFTER patches are reverted.
 
         Args:
             sub_mod: The submodule from which the graph was captured+exported.
diff --git a/tensorrt_llm/_torch/auto_deploy/models/hf.py b/tensorrt_llm/_torch/auto_deploy/models/hf.py
@@ -560,6 +560,14 @@ def __call__(self, module, state_dict, *args, **kwargs) -> None:
 class TextModelExportInfo(SubModuleExportInfo):
     """An export configuration for the text model portion of a VLM."""
 
+    def post_export(self, sub_mod: nn.Module, sub_gm: GraphModule):
+        """Called after export but BEFORE patches are reverted.
+
+        Sets VLM metadata on the GraphModule while patches are still active,
+        so we can read _vlm_input_names from the module class.
+        """
+        self._set_vlm_metadata(sub_mod, sub_gm)
+
     def post_process(self, sub_mod: nn.Module, sub_gm: GraphModule):
         """Post-process the subgraph module and make sure the embedding remains available."""
         # make sure get_input_embeddings function is available in the graph module
@@ -588,10 +596,6 @@ def post_process(self, sub_mod: nn.Module, sub_gm: GraphModule):
             torch._assert, args=(n_embed_tokens, "Avoid embedding getting deleted from graph.")
         )
 
-        # Set VLM metadata on the GraphModule for runtime use.
-        # This is read by ADExecutor to determine which inputs to inject from multimodal_data.
-        self._set_vlm_metadata(sub_mod, sub_gm)
-
     def _set_vlm_metadata(self, sub_mod: nn.Module, sub_gm: GraphModule):
         """Set VLM-related metadata on the GraphModule.
 
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -742,7 +742,6 @@ def _find_vlm_graphmodule(mod: torch.nn.Module) -> Optional[torch.nn.Module]:
     # Store on engine for external access
     engine._vlm_inputs = vlm_inputs
     engine._vlm_model_type = vlm_model_type
-
     # Detect if the model is a VLM that expects custom masks
     # This is relevant for FlashInfer backend with VLM models
     engine._expects_vlm_custom_masks = (
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/export_to_gm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/export_to_gm.py
@@ -192,6 +192,11 @@ def _is_child(child: str, parent: str) -> bool:
             # torch.export can get confused by keyword arguments that are not explicitly defined in
             # the signature but are captured through generic **kwargs. By overwriting the signature,
             # we ensure each argument is explicitly defined in the signature.
+
+            # Create callback to call post_export while patches are still active
+            def _post_export_cb(exported_gm):
+                e_info.post_export(sub_mod, exported_gm)
+
             with set_exact_signature(sub_mod, captured_kwargs):
                 sub_gm = torch_export_to_gm(
                     sub_mod,
@@ -201,6 +206,7 @@ def _is_child(child: str, parent: str) -> bool:
                     clone=self.config.clone_state_dict,
                     strict=self.config.strict,
                     patch_list=self.config.patch_list,
+                    post_export_callback=_post_export_cb,
                 )
 
             # Ensure runtime calls from HF into this exported GraphModule do not fail due to
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py
@@ -184,18 +184,18 @@ def _maybe_init_vlm_custom_mask(
 
         self._vlm_custom_mask_node = custom_mask_node
 
-    def _maybe_append_flashinfer_vlm_custom_mask(self, cached_attn_op, args: Tuple) -> Tuple:
-        """Append FlashInfer VLM custom mask arg (or None) to `args`.
+    def _get_vlm_custom_mask_node(self, cached_attn_op) -> Optional[Node]:
+        """Get the VLM custom mask node for FlashInfer attention ops.
 
         All layers receive the same mask - it provides bidirectional attention
         for image tokens. Sliding window is handled separately by window_left.
+
+        Returns:
+            The custom mask node, or None if not a FlashInfer op or no VLM.
         """
         if not self._is_flashinfer_cached_attn_op(cached_attn_op):
-            return args
-
-        # Append the custom mask node (or None if no VLM)
-        custom_mask = getattr(self, "_vlm_custom_mask_node", None)
-        return (*args, custom_mask)
+            return None
+        return getattr(self, "_vlm_custom_mask_node", None)
 
     def _process_metadata_extra(
         self, gm: GraphModule, cm: CachedSequenceInterface, any_source_attn_node: Node
@@ -239,18 +239,16 @@ def _insert_cached_attn_node(
         """Insert a cached attention node into the graph."""
         with gm.graph.inserting_before(attn_node):
             cached_attn_op = self.attn_descriptor.get_cached_attention_op()
+            custom_mask_node = self._get_vlm_custom_mask_node(cached_attn_op)
             args = (
                 *qkv_nodes,
+                custom_mask_node,
                 *meta_nodes_std,
                 *meta_nodes_extra,
                 *cache_nodes,
                 *buffer_nodes,
                 *constants,
             )
-            # FlashInfer cached attention op optionally accepts a custom mask arg for VLM.
-            # The mask provides bidirectional attention for image tokens. Sliding window
-            # is handled separately by FlashInfer's window_left parameter.
-            args = self._maybe_append_flashinfer_vlm_custom_mask(cached_attn_op, args)
             cached_attn_node = gm.graph.call_function(
                 cached_attn_op,
                 args=args,
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py
@@ -98,6 +98,8 @@ def test_flashinfer_attention_op_context(seq_length, n_heads, batch_size, dtype,
         q,
         k,
         v,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -118,7 +120,6 @@ def test_flashinfer_attention_op_context(seq_length, n_heads, batch_size, dtype,
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Use torch backend as clean reference
@@ -236,6 +237,8 @@ def test_flashinfer_attention_op_decode(
         q,
         k,
         v,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -256,7 +259,6 @@ def test_flashinfer_attention_op_decode(
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     assert torch.allclose(
@@ -364,6 +366,8 @@ def test_flashinfer_attention_context_and_generate(
         q_1,
         k_1,
         v_1,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -384,7 +388,6 @@ def test_flashinfer_attention_context_and_generate(
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Generate reference outputs
@@ -446,6 +449,8 @@ def test_flashinfer_attention_context_and_generate(
         q_3,
         k_3,
         v_3,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -466,7 +471,6 @@ def test_flashinfer_attention_context_and_generate(
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Generate reference outputs
@@ -564,6 +568,8 @@ def test_flashinfer_attention_op_context_input_pos(seq, batch_size, n_heads, dty
         q,
         k,
         v,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -584,7 +590,6 @@ def test_flashinfer_attention_op_context_input_pos(seq, batch_size, n_heads, dty
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Generate ref
@@ -720,6 +725,8 @@ def test_flashinfer_attention_with_fp8_cache(
         q,
         k,
         v,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -740,7 +747,6 @@ def test_flashinfer_attention_with_fp8_cache(
         V_SCALE,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     y = flashinfer_output.view(BATCH_SIZE, SEQ_LEN, N_HEADS, D_HEAD)
@@ -824,6 +830,8 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de
         q,
         k,
         v,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr,
@@ -844,7 +852,6 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Compute reference
@@ -914,6 +921,8 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de
         q_gen,
         k_gen,
         v_gen,
+        # VLM CUSTOM MASK
+        None,  # custom_mask
         # STANDARD METADATA
         batch_info,
         qo_indptr2,
@@ -934,7 +943,6 @@ def test_flashinfer_attention_with_paged_kvcache(seq_lengths, n_heads, dtype, de
         1.0,
         -1,  # window_left (disabled)
         0.0,  # logits_soft_cap (disabled)
-        None,  # custom_mask
     )
 
     # Compute reference