Skip to content

Commit d6f95a4

Browse files
authored
[None][feat] AutoDeploy: Perf optimization for Attention and rmsnorm (#9719)
Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com>
1 parent c7b5e3e commit d6f95a4

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from torch._subclasses import FakeTensor
88
from torch.fx import Node
99

10+
from ...flashinfer_utils import get_env_enable_pdl
1011
from ..utils.cuda_graph import cuda_graph_state
1112
from ..utils.logger import ad_logger
1213
from ..utils.node_utils import extract_op_args
@@ -256,9 +257,9 @@ def flashinfer_mha_with_cache(
256257
q_shape_og = q.shape
257258
b, s = q_shape_og[:2]
258259

259-
q = q.contiguous().view(b * s, -1, head_dim)
260-
k = k.contiguous().view(b * s, -1, head_dim)
261-
v = v.contiguous().view(b * s, -1, head_dim)
260+
q = q.reshape(b * s, -1, head_dim)
261+
k = k.reshape(b * s, -1, head_dim)
262+
v = v.reshape(b * s, -1, head_dim)
262263

263264
n_heads = q.shape[1]
264265
n_kv_heads = k.shape[1]
@@ -275,11 +276,12 @@ def flashinfer_mha_with_cache(
275276
sm_scale=scale,
276277
)
277278

278-
# Assuming k_scale = v_scale = 1.0, we just have to cast k and v to fp8 before appending to kv cache
279+
# Assuming k_scale = v_scale = 1.0
279280
k_scale, v_scale = 1.0, 1.0
281+
# k = (k / k_scale).to(torch.float8_e4m3fn) if k_scale != 1.0, same for v
280282
if k_cache.dtype == torch.float8_e4m3fn:
281-
k = (k / k_scale).to(torch.float8_e4m3fn)
282-
v = (v / v_scale).to(torch.float8_e4m3fn)
283+
k = k.to(torch.float8_e4m3fn)
284+
v = v.to(torch.float8_e4m3fn)
283285

284286
flashinfer.page.append_paged_kv_cache(
285287
k,
@@ -300,7 +302,10 @@ def flashinfer_mha_with_cache(
300302
paged_kv_last_page_len,
301303
pp,
302304
)
303-
y = wrapper.run(q, (k_cache, v_cache), k_scale=k_scale, v_scale=v_scale)
305+
306+
y = wrapper.run(
307+
q, (k_cache, v_cache), k_scale=k_scale, v_scale=v_scale, enable_pdl=get_env_enable_pdl()
308+
)
304309

305310
return y.view(q_shape_og) # [b,s,n*h_d] or [b,s, n, h_d]
306311

tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import flashinfer
44
import torch
55

6+
from ...flashinfer_utils import get_env_enable_pdl
67
from ...modules.mamba.layernorm_gated import _layer_norm_fwd
78
from .triton_kernels.rms_norm import rms_norm
89

@@ -21,7 +22,7 @@ def flashinfer_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) ->
2122
"""
2223
# Flashinfer rmsnorm expects a 2D input
2324
input_flat = input.reshape(-1, input.shape[-1])
24-
rmsnorm_flat = flashinfer.norm.rmsnorm(input_flat, weight, eps)
25+
rmsnorm_flat = flashinfer.norm.rmsnorm(input_flat, weight, eps, enable_pdl=get_env_enable_pdl())
2526
return rmsnorm_flat.reshape(input.shape)
2627

2728

0 commit comments

Comments (0)