Skip to content

Commit b1361c7

Browse files
authored
[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)
Signed-off-by: mgoin <[email protected]>
1 parent 4f0f844 commit b1361c7

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

vllm/platforms/cuda.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
152152
if cls.is_device_capability(100):
153153
# Blackwell => Force CutlassMLA.
154154
use_cutlass_mla = True
155+
# TODO: This does not work, because the
156+
# global_force_attn_backend_context_manager is not set.
157+
# See vllm/attention/selector.py:_cached_get_attn_backend
155158
envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
156159
else:
157160
# Not Blackwell
@@ -217,7 +220,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
217220
if use_mla:
218221
# TODO(lucas): refactor to be more concise
219222
# we should probably consider factoring out V1 here
220-
if selected_backend == _Backend.CUTLASS_MLA:
223+
if selected_backend == _Backend.CUTLASS_MLA or (
224+
cls.is_device_capability(100) and selected_backend is None
225+
and block_size == 128):
221226
if use_v1:
222227
logger.info_once("Using Cutlass MLA backend on V1 engine.")
223228
return ("vllm.v1.attention.backends.mla."

0 commit comments

Comments (0)