@@ -162,15 +162,15 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
162
162
if cls .is_device_capability (100 ):
163
163
# Blackwell => Force CutlassMLA.
164
164
use_cutlass_mla = True
165
- envs .VLLM_ATTENTION_BACKEND = "CUTLASS_MLA_VLLM_V1 "
165
+ envs .VLLM_ATTENTION_BACKEND = "CUTLASS_MLA "
166
166
else :
167
167
# Not Blackwell
168
168
use_flashmla = True
169
169
else :
170
170
# Forced case
171
171
use_flashmla = (envs .VLLM_ATTENTION_BACKEND == "FLASHMLA" )
172
172
use_cutlass_mla = (
173
- envs .VLLM_ATTENTION_BACKEND == "CUTLASS_MLA_VLLM_V1 " )
173
+ envs .VLLM_ATTENTION_BACKEND == "CUTLASS_MLA " )
174
174
175
175
from vllm .attention .ops .flashmla import is_flashmla_supported
176
176
if use_flashmla and is_flashmla_supported ()[0 ] \
@@ -182,7 +182,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
182
182
if use_cutlass_mla and cache_config .block_size != 128 :
183
183
cache_config .block_size = 128
184
184
logger .info ("Forcing kv cache block size to 128 for "
185
- "CUTLASS_MLA_VLLM_V1 backend." )
185
+ "CUTLASS_MLA backend." )
186
186
187
187
compilation_config = vllm_config .compilation_config
188
188
if (envs .VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
@@ -211,9 +211,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
211
211
kv_cache_dtype , block_size , use_v1 ,
212
212
use_mla ) -> str :
213
213
if use_mla :
214
- # TODO(lucas): refactor to be more concise
214
+ # TODO(lucas): refactor to be more concise
215
215
# we should probably consider factoring out V1 here
216
- if selected_backend == _Backend .CUTLASS_MLA_VLLM_V1 :
216
+ if selected_backend == _Backend .CUTLASS_MLA :
217
217
if use_v1 :
218
218
logger .info_once ("Using Cutlass MLA backend on V1 engine." )
219
219
return ("vllm.v1.attention.backends.mla."
0 commit comments