10 | 10 | from vllm.attention.backends.abstract import AttentionLayer |
11 | 11 | from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd |
12 | 12 | from vllm.config import VllmConfig |
| 13 | +from vllm.platforms.rocm import on_gfx950 |
13 | 14 | from vllm.utils import cdiv |
14 | 15 | from vllm.v1.attention.backends.mla.common import ( |
15 | 16 |     MLACommonBackend,
@@ -243,23 +244,59 @@ def __init__( |
243 | 244 |                 "alibi_slopes, sliding_window, logits_soft_cap"
244 | 245 |             )
245 | 246 |
246 | | -        from aiter import flash_attn_varlen_func
| 247 | +        from aiter import flash_attn_varlen_func as aiter_flash_attn_varlen_func
| 248 | +        from aiter.ops.triton.mha import (
| 249 | +            flash_attn_varlen_func as triton_flash_attn_varlen_func,
| 250 | +        )
| 251 | +
| 252 | +        self.triton_flash_attn_varlen_func = triton_flash_attn_varlen_func
| 253 | +        self.aiter_flash_attn_varlen_func = aiter_flash_attn_varlen_func
| 254 | +
| 255 | +    def _use_triton_mha(self, q, k, **kwargs) -> bool:
| 256 | +        # TODO: refine the dispatch logic for non-GFX950 GPUs
| 257 | +        if not on_gfx950():
| 258 | +            return False
| 259 | +
| 260 | +        cu_seqlens_q = kwargs.get("cu_seqlens_q")
| 261 | +        max_seqlen_q = kwargs.get("max_seqlen_q", q.size(0))
| 262 | +        max_seqlen_k = kwargs.get("max_seqlen_k", k.size(0))
| 263 | +
| 264 | +        bs = cu_seqlens_q.shape[0] - 1 if cu_seqlens_q is not None else 1
| 265 | +
| 266 | +        # TODO: consider more comprehensive conditions here
| 267 | +        use_triton_mha = bs <= 32
| 268 | +        use_triton_mha = use_triton_mha and (max_seqlen_q <= 1024)
| 269 | +        use_triton_mha = use_triton_mha and (max_seqlen_k <= 1024)
247 | 270 |
248 | | -        self.flash_attn_varlen_func = flash_attn_varlen_func
| 271 | +        return use_triton_mha
249 | 272 |
250 | 273 |     def _flash_attn_varlen_diff_headdims(
251 | 274 |         self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
252 | 275 |     ):
253 | | -        output = self.flash_attn_varlen_func(
254 | | -            q=q,
255 | | -            k=k,
256 | | -            v=v,
257 | | -            softmax_scale=softmax_scale,
258 | | -            return_lse=return_softmax_lse,
259 | | -            **kwargs,
260 | | -        )
261 | | -
262 | | -        return output
| 276 | +        # Force Triton MHA when the env var is set; otherwise dispatch heuristically.
| 277 | +        if envs.VLLM_ROCM_USE_AITER_TRITON_MLA or self._use_triton_mha(q, k, **kwargs):
| 278 | +            result = self.triton_flash_attn_varlen_func(
| 279 | +                q=q,
| 280 | +                k=k,
| 281 | +                v=v,
| 282 | +                softmax_scale=softmax_scale,
| 283 | +                return_lse=return_softmax_lse,
| 284 | +                **kwargs,
| 285 | +            )
| 286 | +            if return_softmax_lse and type(result) is tuple:
| 287 | +                output, lse = result
| 288 | +                return (output, lse.T.contiguous())
| 289 | +            return result
| 290 | +        else:
| 291 | +            output = self.aiter_flash_attn_varlen_func(
| 292 | +                q=q,
| 293 | +                k=k,
| 294 | +                v=v,
| 295 | +                softmax_scale=softmax_scale,
| 296 | +                return_lse=return_softmax_lse,
| 297 | +                **kwargs,
| 298 | +            )
| 299 | +            return output
263 | 300 |
264 | 301 |     def _forward_decode(
265 | 302 |         self,
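
For readers skimming the hunk: the change routes MLA prefill attention between two varlen flash-attention kernels. Below is a minimal, standalone sketch of that dispatch decision, not the vLLM code itself: the helper name `pick_prefill_backend` and the example shapes are hypothetical, the env-var read is simplified (vLLM reads it through `vllm.envs`), and the real code additionally gates the heuristic on `on_gfx950()` and transposes the Triton kernel's LSE (`lse.T.contiguous()`) to match the layout the caller expects.

```python
import os


def pick_prefill_backend(batch_size: int, max_seqlen_q: int, max_seqlen_k: int) -> str:
    """Sketch of which varlen flash-attention kernel MLA prefill would pick."""
    # Simplified env-var override; forces the Triton MHA kernel regardless of shapes.
    if os.environ.get("VLLM_ROCM_USE_AITER_TRITON_MLA", "0").lower() in ("1", "true"):
        return "triton"
    # Heuristic mirroring _use_triton_mha(): small batches with short sequences
    # (plus the gfx950 hardware check, omitted here) go to the Triton MHA kernel.
    if batch_size <= 32 and max_seqlen_q <= 1024 and max_seqlen_k <= 1024:
        return "triton"
    # Everything else stays on the CK-based aiter flash_attn_varlen_func.
    return "aiter"


if __name__ == "__main__":
    print(pick_prefill_backend(8, 512, 512))     # -> triton
    print(pick_prefill_backend(64, 4096, 4096))  # -> aiter
```

The thresholds (batch size <= 32, sequence lengths <= 1024) come straight from `_use_triton_mha` and are flagged with a TODO in the diff, so treat them as tunable heuristics rather than final values.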