Skip to content

Commit b1361c7

Browse files
authored
[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)
Signed-off-by: mgoin <[email protected]>
1 parent 4f0f844 commit b1361c7

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

vllm/platforms/cuda.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
152152
if cls.is_device_capability(100):
153153
# Blackwell => Force CutlassMLA.
154154
use_cutlass_mla = True
155+
# TODO: This does not work, because the
156+
# global_force_attn_backend_context_manager is not set.
157+
# See vllm/attention/selector.py:_cached_get_attn_backend
155158
envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
156159
else:
157160
# Not Blackwell
@@ -217,7 +220,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
217220
if use_mla:
218221
# TODO(lucas): refactor to be more concise
219222
# we should probably consider factoring out V1 here
220-
if selected_backend == _Backend.CUTLASS_MLA:
223+
if selected_backend == _Backend.CUTLASS_MLA or (
224+
cls.is_device_capability(100) and selected_backend is None
225+
and block_size == 128):
221226
if use_v1:
222227
logger.info_once("Using Cutlass MLA backend on V1 engine.")
223228
return ("vllm.v1.attention.backends.mla."

0 commit comments

Comments (0)