
Commit 39cd09d

zyongye and mgoin authored
[Bugfix] use flash attn on sm90 (#22933)
Signed-off-by: Yongye Zhu <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
1 parent 919234f commit 39cd09d

File tree

1 file changed: +1, -1 lines changed

vllm/platforms/cuda.py

Lines changed: 1 addition & 1 deletion
@@ -316,7 +316,7 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
 
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
-            if has_sink:
+            if has_sink and not cls.is_device_capability(90):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return TRITON_ATTN_VLLM_V1
         if is_default_backend_supported := is_attn_backend_supported(
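
For context, the sketch below mirrors the changed control flow in a self-contained form: with attention sinks (`has_sink`), the old condition fell back to the Triton backend on every SM 8.0+ GPU, while the fixed condition keeps FlashAttention on SM 9.0 (Hopper). This is a minimal illustration, not vLLM's actual implementation; only `has_device_capability`, `is_device_capability`, `has_sink`, and `TRITON_ATTN_VLLM_V1` come from the diff, and the `CudaPlatformSketch` class and backend strings are hypothetical stand-ins.

```python
# Minimal sketch (not vLLM's actual code) of the backend choice this commit changes.
# Assumption: capability is encoded as major*10 + minor, matching the 80/90 values
# passed to has_device_capability()/is_device_capability() in the diff above.

TRITON_ATTN_VLLM_V1 = "TritonAttentionBackend"  # placeholder, not the real class path
FLASH_ATTN_V1 = "FlashAttentionBackend"         # placeholder, not the real class path


class CudaPlatformSketch:
    def __init__(self, major: int, minor: int) -> None:
        self.capability = major * 10 + minor  # e.g. 90 for SM 9.0 (Hopper)

    def has_device_capability(self, minimum: int) -> bool:
        # "At least this capability": 80 means SM 8.0 or newer.
        return self.capability >= minimum

    def is_device_capability(self, exact: int) -> bool:
        # Exact match only: 90 means precisely SM 9.0.
        return self.capability == exact

    def get_attn_backend_cls(self, has_sink: bool) -> str:
        # FlashAttention is the default for SM 8.0+ GPUs
        if self.has_device_capability(80):
            # After this commit, sinks force Triton only on non-Hopper GPUs;
            # SM 9.0 stays on FlashAttention.
            if has_sink and not self.is_device_capability(90):
                return TRITON_ATTN_VLLM_V1
            return FLASH_ATTN_V1
        return TRITON_ATTN_VLLM_V1  # simplified fallback for older GPUs


if __name__ == "__main__":
    h100 = CudaPlatformSketch(9, 0)
    a100 = CudaPlatformSketch(8, 0)
    print(h100.get_attn_backend_cls(has_sink=True))  # FlashAttentionBackend (new behavior)
    print(a100.get_attn_backend_cls(has_sink=True))  # TritonAttentionBackend (unchanged)
```

In the real function, the non-sink path continues into the `is_attn_backend_supported` check visible at the bottom of the diff; that part is simplified away in this sketch.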
