
Commit 1e799b7

[BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910)
1 parent 7f6c5ee commit 1e799b7

File tree: 1 file changed (+1, −1)

vllm/platforms/cuda.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 # here
                 use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
                                 or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-                from vllm.attention.backends.flashmla import is_flashmla_supported
+                from vllm.attention.ops.flashmla import is_flashmla_supported
                 if use_flashmla and is_flashmla_supported()[0] \
                         and cache_config.block_size != 64:
                     cache_config.block_size = 64
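
Note on the fix: the only change is the import path of the is_flashmla_supported probe, from vllm.attention.backends.flashmla to vllm.attention.ops.flashmla. The likely mechanism, inferred from the commit title rather than stated on this page, is that importing the backends module inside check_and_update_config pulls in GPU-touching code at import time and so (re)initializes the CUDA context during platform configuration, while the ops module exposes the same probe without that side effect. Below is a minimal, self-contained sketch of the underlying pattern: keep capability probes in import-cheap code so configuration checks stay free of GPU side effects. Apart from the logic copied from the diff, every name here (CacheConfig, the stubbed probe, the __main__ driver) is illustrative, not vLLM's actual API.

# Sketch only (illustrative names, not vLLM's implementation): capability probes
# live in import-cheap code, so configuration checks can call them without
# touching the GPU or creating a CUDA context.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class CacheConfig:
    block_size: int = 16


def is_flashmla_supported() -> Tuple[bool, str]:
    # Stubbed probe: the real check inspects the GPU, but the property that
    # matters here is that importing and calling it does no CUDA initialization.
    return False, "stub: GPU probing omitted so the sketch runs anywhere"


def check_and_update_config(cache_config: CacheConfig,
                            attention_backend: Optional[str]) -> None:
    # Mirrors the diff's logic: if FlashMLA would be selected and is supported,
    # force the KV-cache block size to 64, which FlashMLA requires.
    use_flashmla = attention_backend is None or attention_backend == "FLASHMLA"
    if use_flashmla and is_flashmla_supported()[0] \
            and cache_config.block_size != 64:
        cache_config.block_size = 64


if __name__ == "__main__":
    cfg = CacheConfig()
    check_and_update_config(cfg, attention_backend=None)
    print(cfg.block_size)  # prints 16: the stub reports FlashMLA as unsupported

In vLLM itself only the import path changes; the surrounding block-size logic is untouched, which is why the diff is a single line.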
