vllm-project · DarkLight1337 · May 9, 2025 · Mar 28, 2025 · Mar 28, 2025 · Mar 29, 2025
@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="7e1ed08"
+ARG AITER_BRANCH="5a77249"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base

@@ -102,7 +102,10 @@ def test_env(
                                                    block_size,
                                                    False,
                                                    use_mla=use_mla)
-                        assert backend.get_name() == name
+                        if use_v1 and name != "TRITON_MLA":
+                            assert backend.get_name() == f"{name}_VLLM_V1"
+                        else:
+                            assert backend.get_name() == name
                     else:
                         with pytest.raises(ValueError) as exc_info:
                             get_attn_backend(16,

diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py
@@ -16,7 +16,7 @@
 from vllm.attention.backends.utils import (compute_slot_mapping,
                                            compute_slot_mapping_start_idx,
                                            is_block_tables_empty)
-from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd,
+from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_forward,
                                                get_aiter_mla_metadata)
 
 if TYPE_CHECKING:
@@ -404,9 +404,9 @@ def _forward_decode(
 
         kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
 
-        aiter_mla_decode_fwd(q, kv_buffer, o, self.scale,
-                             attn_metadata.paged_kv_indptr,
-                             attn_metadata.paged_kv_indices,
-                             attn_metadata.paged_kv_last_page_lens)
+        aiter_mla_decode_forward(q, kv_buffer, o, self.scale,
+                                 attn_metadata.paged_kv_indptr,
+                                 attn_metadata.paged_kv_indices,
+                                 attn_metadata.paged_kv_last_page_lens)
 
         return self._v_up_proj_and_o_proj(o)
@@ -4,6 +4,9 @@
 
 import torch
 
+from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
+
 
 def get_aiter_mla_metadata(max_batch_size: int, block_size: int,
                            max_block_per_batch: int,
@@ -20,7 +23,7 @@ def get_aiter_mla_metadata(max_batch_size: int, block_size: int,
     return paged_kv_indices, paged_kv_indptr, paged_kv_last_page_lens
 
 
-def aiter_mla_decode_fwd(
+def aiter_mla_decode_forward(
     q: torch.Tensor,
     kv_buffer: torch.Tensor,
     o: torch.Tensor,
@@ -30,6 +33,28 @@ def aiter_mla_decode_fwd(
     kv_last_page_lens: Optional[torch.Tensor] = None,
     logit_cap: float = 0.0,
 ):
+
+    torch.ops.vllm.rocm_aiter_mla_decode_fwd(q,
+                                             kv_buffer.view(
+                                                 -1, 1, 1, q.shape[-1]),
+                                             o,
+                                             kv_indptr,
+                                             kv_indices,
+                                             kv_last_page_lens,
+                                             sm_scale=sm_scale,
+                                             logit_cap=logit_cap)
+
+
+def mla_decode_fwd_impl(
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,
+    kv_indptr: Optional[torch.Tensor] = None,
+    kv_indices: Optional[torch.Tensor] = None,
+    kv_last_page_lens: Optional[torch.Tensor] = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
     from aiter.mla import mla_decode_fwd
 
     mla_decode_fwd(q,
@@ -40,3 +65,24 @@ def aiter_mla_decode_fwd(
                    kv_last_page_lens,
                    sm_scale=sm_scale,
                    logit_cap=logit_cap)
+
+
+def mla_decode_fwd_fake(  # no-op stand-in; same parameter list as mla_decode_fwd_impl
+    q: torch.Tensor,
+    kv_buffer: torch.Tensor,
+    o: torch.Tensor,  # listed in mutates_args at registration; left untouched here
+    kv_indptr: Optional[torch.Tensor] = None,
+    kv_indices: Optional[torch.Tensor] = None,
+    kv_last_page_lens: Optional[torch.Tensor] = None,
+    sm_scale: float = 1.0,
+    logit_cap: float = 0.0,
+) -> None:
+    pass  # intentionally empty: supplied as fake_impl for the rocm_aiter_mla_decode_fwd op
+
+
+if current_platform.is_rocm():  # register the custom op only on ROCm platforms
+    direct_register_custom_op(op_name="rocm_aiter_mla_decode_fwd",  # invoked as torch.ops.vllm.rocm_aiter_mla_decode_fwd
+                              op_func=mla_decode_fwd_impl,  # real implementation; imports aiter.mla inside the function
+                              mutates_args=["o"],  # declares that `o` is written in place
+                              fake_impl=mla_decode_fwd_fake,  # no-op counterpart with the same signature
+                              tags=[torch.Tag.needs_fixed_stride_order])
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -1298,6 +1298,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
             "FLASHMLA",
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
+            "ROCM_AITER_MLA",
         ]
         if (envs.is_set("VLLM_ATTENTION_BACKEND")
                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):

diff --git a/vllm/envs.py b/vllm/envs.py
@@ -84,6 +84,7 @@
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
     VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True
+    VLLM_ROCM_EXECUTE_MODEL_TIMEOUT: int = 250  #s
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
     VLLM_DISABLE_COMPILE_CACHE: bool = False
@@ -488,6 +489,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_RPC_TIMEOUT":
     lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
 
+    # Timeout in seconds for model execution on ROCm platforms.
+    "VLLM_ROCM_EXECUTE_MODEL_TIMEOUT":
+    lambda: int(os.getenv("VLLM_ROCM_EXECUTE_MODEL_TIMEOUT", "250")),
+
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded

@@ -145,7 +145,7 @@ def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake(
         block_shape: List[int],
         smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
 
-    return torch.empty_like(a1, dtype=torch.bf16)
+    return torch.empty_like(a1, dtype=hidden_states_dtype)
 
 
 def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor,

diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
@@ -39,7 +39,8 @@ class _Backend(enum.Enum):
     TRITON_ATTN_VLLM_V1 = enum.auto()
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
-    ROCM_AITER_MLA = enum.auto()
+    ROCM_AITER_MLA = enum.auto()  # Supported by V1
+    ROCM_AITER_MLA_VLLM_V1 = enum.auto()
     TORCH_SDPA = enum.auto()
     FLASHINFER = enum.auto()
     TRITON_MLA = enum.auto()  # Supported by V1

@@ -155,10 +155,22 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                     raise ValueError(
                         f" The selected backend, {selected_backend.name},"
                         f"does not support block size {block_size}.")
-            elif selected_backend == _Backend.ROCM_AITER_MLA:
+            elif selected_backend == _Backend.ROCM_AITER_MLA \
+                or selected_backend == _Backend.ROCM_AITER_MLA_VLLM_V1:
                 if block_size == 1:
-                    logger.info("Using AITER MLA backend.")
-                    return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
+                    if use_v1:
+                        logger.info("Using AITER MLA backend on V1 engine.")
+                        logger.warning(
+                            "Increasing the model execution timeout"
+                            "using the VLLM_ROCM_EXECUTE_MODEL_TIMEOUT"
+                            "environment variable is recommended"
+                            "if timeout error is encountered"
+                            "when running %s"
+                            "backend on V1 engine.", selected_backend)
+                        return "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
+                    else:
+                        logger.info("Using AITER MLA backend")
+                        return "vllm.attention.backends.rocm_aiter_mla.AiterMLABackend"  # noqa: E501
                 else:
                     raise ValueError(
                         f" The selected backend, {selected_backend.name},"

@@ -494,11 +494,12 @@ def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
                 max_context_chunk = (self.chunked_prefill_workspace_size //
                                      num_prefills_with_context_cpu)
 
-                # align max_context_chunk to page_size by rounding down,
-                # currently the `gather_cache` kernel cannot handle
-                # `context_chunk_starts` that are not aligned to page_size
-                max_context_chunk = round_down(max_context_chunk,
-                                               self.page_size)
+                if self.aot_schedule:
 self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3) 
 # Dont try to access the runner on AMD 
 if self.aot_schedule: 
     self.page_size = self.runner.block_size 
 self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3) 
  
 # Dont try to access the runner on AMD 
 if self.aot_schedule: 
     self.page_size = self.runner.block_size 
+                    # align max_context_chunk to page_size by rounding down,
+                    # currently the `gather_cache` kernel cannot handle
+                    # `context_chunk_starts` that are not aligned to page_size
+                    max_context_chunk = round_down(max_context_chunk,
+                                                   self.page_size)
 
                 assert max_context_chunk > 0
                 num_chunks = cdiv(max_context_len_cpu, max_context_chunk)