no-pad-fas3

LucasWilkinson · LucasWilkinson · commit 2101882ff24c · 2025-02-11T19:32:14.000Z
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -581,7 +581,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
+          GIT_TAG 62cd67b571e806aa694a4c0f293d72a0f4717a97
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py
@@ -29,6 +29,7 @@
     scaled_dequantize, scaled_quantize)
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, RotaryEmbedding)
+from vllm.platforms import current_platform
 
 try:
     from vllm.vllm_flash_attn import flash_attn_varlen_func
@@ -182,6 +183,10 @@ def __init__(
         self.kv_b_proj = kv_b_proj
         self.o_proj = o_proj
         self.vllm_flash_attn_version = get_flash_attn_version()
+        # Differing K and V head dims are currently supported only by FA3 on
+        # Hopper (compute capability 9.x) devices; otherwise V must be padded.
+        self.pad_v_head = not self.vllm_flash_attn_version >= 3 or \
+            current_platform.get_device_capability()[0] != 9
 
     def _v_up_proj_and_o_proj(self, x):
         if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION:
@@ -501,11 +506,10 @@ def _forward_prefill_flash(
 
         k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
 
-        # For MLA the v head dim is smaller than qk head dim so we pad out
-        # v with 0s to match the qk head dim
         v_dim = v.shape[-1]
-        pad_v = self.vllm_flash_attn_version < 3
-        if pad_v:
+        if self.pad_v_head:
+            # For MLA the v head dim is smaller than qk head dim so we pad out
+            # v with 0s to match the qk head dim
             v = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
                                         value=0)
 
@@ -522,7 +526,7 @@ def _forward_prefill_flash(
             fa_version=self.vllm_flash_attn_version,
         )
 
-        if pad_v:
+        if self.pad_v_head:
             attn_output = attn_output\
                 .view(-1, self.num_heads, q.shape[-1])[..., :v_dim]