|
| 1 | +"""Patch for torch.export.export to detect and replace hf attention_interface with unified attention.""" |
| 2 | + |
| 3 | +from typing import Optional |
| 4 | + |
| 5 | +import torch |
| 6 | +import torch.export as te |
| 7 | +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS |
| 8 | + |
| 9 | +from ..interface import BaseExportPatch, ExportPatchRegistry |
| 10 | + |
# Kwargs mapping for HF attention_interface to auto_deploy::torch_attention.
# Keys are kwarg names HF may pass into an attention_interface callable;
# values are the corresponding parameter names of torch.ops.auto_deploy.torch_attention.
# Unlisted kwargs are intentionally dropped by the wrapper.
HF_ATTN_KWARGS_MAPPING: dict[str, str] = {
    "dropout": "dropout_p",
    "is_causal": "is_causal",
    # Both spellings seen across HF model code map to the op's "scale".
    "scaling": "scale",
    "scale": "scale",
    # Both spellings seen across HF model code map to the op's "sinks".
    "s_aux": "sinks",
    "sinks": "sinks",
    "sliding_window": "sliding_window",
    "logit_cap": "logit_cap",
}
| 22 | + |
| 23 | + |
def torch_attention_hf_wrapper(
    self: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    **kwargs,
):
    """Adapter exposing auto_deploy::torch_attention via the HF attention_interface signature.

    ``self`` is required by the HF attention_interface calling convention but is unused.
    Returns ``(attn_output, None)`` — attention weights are not computed.
    """
    # HF hands tensors in [batch, num_heads, seq_len, head_dim]; the custom op is
    # invoked with layout="bsnd", i.e. [batch, seq_len, num_heads, head_dim].
    q_bsnd = query.transpose(1, 2)
    k_bsnd = key.transpose(1, 2)
    v_bsnd = value.transpose(1, 2)

    # Translate recognized HF kwargs to the op's parameter names; anything not in
    # the mapping is deliberately ignored.
    translated_kwargs = {}
    for name, val in kwargs.items():
        if name in HF_ATTN_KWARGS_MAPPING:
            translated_kwargs[HF_ATTN_KWARGS_MAPPING[name]] = val

    attn_out = torch.ops.auto_deploy.torch_attention(
        q_bsnd,
        k_bsnd,
        v_bsnd,
        attn_mask=attention_mask,
        layout="bsnd",
        **translated_kwargs,
    )

    return attn_out, None
| 53 | + |
| 54 | + |
@ExportPatchRegistry.register("unified_attn")
class UnifiedAttnPatch(BaseExportPatch):
    """Patch torch.export.export to route the HF attention_interface through
    torch.ops.auto_deploy.torch_attention for the duration of an export call.
    """

    def _apply_patch(self):
        """Apply the te.export patch."""
        # Store original torch.export.export so _revert_patch can restore it.
        self.original_values["te.export"] = te.export

        # Register the wrapper function. Registration is idempotent; it is
        # intentionally left in place by _revert_patch so any model still
        # configured with "ad_unified_attn" keeps resolving.
        ALL_ATTENTION_FUNCTIONS.register("ad_unified_attn", torch_attention_hf_wrapper)

        def _export_with_unified_attn(model, *args, **kwargs):
            # torch_export_to_gm is called at both export stage and attn matching
            # stage; we only patch the attn implementation for the export stage,
            # so restore the model's original setting once export returns
            # (previously the override leaked past the export call).
            config = getattr(model, "config", None)
            if config is None or not hasattr(config, "_attn_implementation"):
                return self.original_values["te.export"](model, *args, **kwargs)
            prev_impl = config._attn_implementation
            config._attn_implementation = "ad_unified_attn"
            try:
                return self.original_values["te.export"](model, *args, **kwargs)
            finally:
                config._attn_implementation = prev_impl

        # Apply patch
        te.export = _export_with_unified_attn

    def _revert_patch(self):
        """Revert the te.export patch."""
        te.export = self.original_values["te.export"]
0 commit comments