@@ -89,23 +89,34 @@ def _vllm_layout_trans_kernel(
         tl.store(k_values_ptr + kv_values_off, k_vals, mask=block_mask)
         tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask)
 
-def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table,
-                      k_cache, v_cache, max_seq_len, k_scale, v_scale,
-                      output_dtype, total_tokens):
+@torch.inference_mode()
+def vllm_layout_trans(b_query_lens_loc,
+                      b_seq_lens_loc,
+                      block_table,
+                      k_cache,
+                      v_cache,
+                      max_seq_len,
+                      k_scale,
+                      v_scale,
+                      output_dtype,
+                      total_tokens,
+                      k_values=None,
+                      v_values=None):
     H_KV = v_cache.shape[2]
     D = v_cache.shape[3]
     BLOCK_SIZE = v_cache.shape[1]
-
-    k_values = torch.empty(
-        (total_tokens, H_KV, D),
-        dtype=output_dtype,
-        device=k_cache.device,
-    )
-    v_values = torch.empty(
-        (total_tokens, H_KV, D),
-        dtype=output_dtype,
-        device=v_cache.device,
-    )
+    if k_values is None:
+        k_values = torch.empty(
+            (total_tokens, H_KV, D),
+            dtype=output_dtype,
+            device=k_cache.device,
+        )
+    if v_values is None:
+        v_values = torch.empty(
+            (total_tokens, H_KV, D),
+            dtype=output_dtype,
+            device=v_cache.device,
+        )
 
     grid = (block_table.shape[0],
             (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
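
[Reviewer note, not part of the diff] The new optional k_values/v_values parameters let a caller hand vllm_layout_trans preallocated destination buffers, so the per-call torch.empty only runs when nothing is passed in. A minimal sketch of the intended reuse pattern; the shapes, dtype, and device below are made up for illustration:

    import torch

    # Hypothetical sizes for illustration only.
    total_tokens, H_KV, D = 8192, 8, 128
    dtype, device = torch.bfloat16, "cuda"

    # Allocate once, e.g. when the attention metadata is built ...
    k_values = torch.empty((total_tokens, H_KV, D), dtype=dtype, device=device)
    v_values = torch.empty((total_tokens, H_KV, D), dtype=dtype, device=device)

    # ... then reuse the same buffers for every layer's call:
    # k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table,
    #                          k_cache, v_cache, max_seq_len, k_scale, v_scale,
    #                          dtype, total_tokens,
    #                          k_values=k_values, v_values=v_values)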
@@ -148,13 +159,14 @@ def flash_attn_varlen_func_impl(
     block_table: torch.Tensor,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
-    total_tokens: int = 0,
+    total_tokens: int,
+    k_values: Optional[torch.Tensor] = None,
+    v_values: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    if total_tokens == 0:
-        total_tokens = int(cu_seqlens_k[-1].item())
     k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table,
                              k_cache, v_cache, max_seqlen_k, k_scale,
-                             v_scale, q.dtype, total_tokens)
+                             v_scale, q.dtype, total_tokens, k_values,
+                             v_values)
 
     output = aiter.flash_attn_varlen_func(
         q=q,
@@ -222,24 +234,27 @@ class AiterFlashAttentionMetadata:
     seq_lens: torch.Tensor
     slot_mapping: torch.Tensor
     block_table: torch.Tensor
-    cu_seq_lens: Optional[torch.Tensor]
 
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    total_tokens: int
+    k_buffer: torch.Tensor
+    v_buffer: torch.Tensor
+    workspace_buffer: torch.Tensor
+    cu_seq_lens: torch.Tensor
 
 
 class AiterFlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[AiterFlashAttentionMetadata]):
-    cudagraph_support = AttentionCGSupport.ALWAYS
+    cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
+        self.compilation_config = vllm_config.compilation_config
         self.device = device
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
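
[Reviewer note, not part of the diff] As build() and forward() below show, k_buffer, v_buffer, workspace_buffer, and cu_seq_lens can all be None (decode-only batches, or metadata built under stream capture), so Optional annotations would describe the actual contract more precisely than the bare torch.Tensor types added here. A sketch of that suggestion, not the code in this PR:

    from dataclasses import dataclass
    from typing import Optional

    import torch

    @dataclass
    class BuffersMetadataSketch:
        # None for decode-only batches or during CUDA graph capture.
        k_buffer: Optional[torch.Tensor]
        v_buffer: Optional[torch.Tensor]
        workspace_buffer: Optional[torch.Tensor]
        cu_seq_lens: Optional[torch.Tensor]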
@@ -249,53 +264,68 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
         self.kv_cache_spec = kv_cache_spec
-        # Sliding window size to be used with the AOT scheduler will be
-        # populated on first build() call.
-        self.aot_sliding_window: Optional[tuple[int, int]] = None
-        self.total_tokens: int = 0
-
-    def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata):
-        self.total_tokens = self.model_config.max_model_len \
-            * self.vllm_config.scheduler_config.max_num_partial_prefills
-        res = self.build(common_prefix_len=0,
-                         common_attn_metadata=common_attn_metadata)
-        self.total_tokens = 0
-        return res
 
     def build(self,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
               fast_build: bool = False) -> 'AiterFlashAttentionMetadata':
-
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
-        max_seq_len = common_attn_metadata.max_seq_len
+
         query_start_loc = common_attn_metadata.query_start_loc
         seq_lens = common_attn_metadata.seq_lens
         block_table_tensor = common_attn_metadata.block_table_tensor
         slot_mapping = common_attn_metadata.slot_mapping
-        if max_query_len > 1:
-            # We pre-compute cumulative seq len needed for prefill attention
-            # here to avoid recomputing it for every layer
-            cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
-                                      dtype=torch.int32,
-                                      device=seq_lens.device)
-            torch.cumsum(seq_lens,
-                         dim=0,
-                         dtype=cu_seq_lens.dtype,
-                         out=cu_seq_lens[1:])
-            num_actual_kv_tokens = int(cu_seq_lens[-1].item())
-        else:
-            cu_seq_lens = None
-            num_actual_kv_tokens = 0
-
-        def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
-                     max_seq_len, causal):
-            return None
+        num_seqs = common_attn_metadata.num_reqs
+        max_seq_len = common_attn_metadata.max_seq_len
+        num_actual_kv_tokens = int(seq_lens.sum())
 
         use_cascade = common_prefix_len > 0
 
+        nbytes_per_qo_elem = torch.finfo(self.model_config.dtype).bits // 8
+        max_num_partitions = (max_seq_len + _PARTITION_SIZE_ROCM -
+                              1) // _PARTITION_SIZE_ROCM
+        empty_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        k_buffer = None
+        v_buffer = None
+        workspace_buffer = None
+        cu_seq_lens = None
+        if max_query_len > 1:
+            required_memory = num_actual_kv_tokens * \
+                self.num_heads_kv * self.headdim * 2 * 2
+            if required_memory >= empty_gpu_memory:
+                raise ValueError(
+                    f"Not enough GPU memory to allocate k_buffer and v_buffer. "
+                    f"Required: {required_memory} bytes, "
+                    f"Available: {empty_gpu_memory} bytes, please reduce the "
+                    f"max_num_seqs or max_model_len.")
+            if not torch.cuda.graphs.is_current_stream_capturing():
+                k_buffer = torch.empty(
+                    (num_actual_kv_tokens, self.num_heads_kv, self.headdim),
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                )
+                v_buffer = torch.empty(
+                    (num_actual_kv_tokens, self.num_heads_kv, self.headdim),
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                )
+                cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
+                                          dtype=torch.int32,
+                                          device=self.device)
+                torch.cumsum(seq_lens,
+                             dim=0,
+                             dtype=cu_seq_lens.dtype,
+                             out=cu_seq_lens[1:])
+
+        workspace_buffer = torch.empty(
+            (num_seqs * self.num_heads_q * max_num_partitions * self.headdim) *
+            nbytes_per_qo_elem + 2 *
+            (num_seqs * self.num_heads_q * max_num_partitions) * 4,
+            dtype=torch.uint8,
+            device=self.device,
+        )
+
         attn_metadata = AiterFlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             num_actual_kv_tokens=num_actual_kv_tokens,
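
[Reviewer note, not part of the diff] The workspace sizing moved into build() is the same arithmetic forward() used before: one partial-output slab of num_seqs * num_heads_q * max_num_partitions * headdim elements in the query/output dtype, plus two 4-byte accumulators per (sequence, head, partition), presumably the split-K max and sum buffers of the paged-attention kernel. The required_memory guard likewise appears to be 2 bytes per element times 2 buffers (K and V) of transposed cache. A standalone re-derivation of the workspace size with made-up sizes; _PARTITION_SIZE_ROCM = 256 is an assumption of this sketch:

    # Illustrative only; every number here is an assumption.
    _PARTITION_SIZE_ROCM = 256
    num_seqs, num_heads_q, headdim = 64, 32, 128
    max_seq_len = 4096
    nbytes_per_qo_elem = 2  # bf16/fp16 model dtype

    max_num_partitions = (max_seq_len + _PARTITION_SIZE_ROCM -
                          1) // _PARTITION_SIZE_ROCM  # 16 partitions
    workspace_bytes = (
        # partial outputs, one per (sequence, head, partition)
        num_seqs * num_heads_q * max_num_partitions * headdim *
        nbytes_per_qo_elem
        # plus two float32 scalars per (sequence, head, partition)
        + 2 * (num_seqs * num_heads_q * max_num_partitions) * 4)
    print(workspace_bytes)  # 8650752 bytes for these sizes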
@@ -305,10 +335,12 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             seq_lens=seq_lens,
             block_table=block_table_tensor,
             slot_mapping=slot_mapping,
-            cu_seq_lens=cu_seq_lens,
             use_cascade=use_cascade,
             common_prefix_len=common_prefix_len,
-            total_tokens=self.total_tokens,
+            k_buffer=k_buffer,
+            v_buffer=v_buffer,
+            workspace_buffer=workspace_buffer,
+            cu_seq_lens=cu_seq_lens,
         )
         return attn_metadata
 
@@ -381,6 +413,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
+        sinks: Optional[torch.Tensor] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -410,6 +443,9 @@ def __init__(
                                       "encoder/decoder cross-attention "
                                       "are not implemented for "
                                       "FlashAttentionImpl")
+        self.sinks = sinks
+        if self.sinks is not None:
+            raise NotImplementedError("Sinks are not supported for ROCM AITER")
 
     def forward(
         self,
@@ -491,6 +527,17 @@ def forward(
             block_table = attn_metadata.block_table
 
             if max_seqlen_q > 1:
+                if attn_metadata.cu_seq_lens is None:
+                    cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1,
+                                              dtype=torch.int32,
+                                              device=query.device)
+                    torch.cumsum(seqused_k,
+                                 dim=0,
+                                 dtype=cu_seq_lens.dtype,
+                                 out=cu_seq_lens[1:])
+                else:
+                    cu_seq_lens = attn_metadata.cu_seq_lens
+
                 torch.ops.vllm.flash_attn_varlen_func(
                     query[:num_actual_tokens],
                     key_cache,
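
[Reviewer note, not part of the diff] This fallback rebuilds the exclusive prefix sum of the key lengths for the case where the metadata carries no cu_seq_lens (build() above leaves it as None when the stream is capturing or when max_query_len is 1). For reference, a toy example of the layout it produces:

    import torch

    # cu_seqlens_k layout: a leading zero followed by the running total of
    # per-sequence key lengths.
    seqused_k = torch.tensor([3, 5, 2], dtype=torch.int32)
    cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1, dtype=torch.int32)
    torch.cumsum(seqused_k, dim=0, dtype=cu_seq_lens.dtype,
                 out=cu_seq_lens[1:])
    print(cu_seq_lens)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)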
@@ -503,25 +550,29 @@ def forward(
                     alibi_slopes=self.alibi_slopes,
                     window_size=self.sliding_window,
                     block_table=block_table,
-                    cu_seqlens_k=attn_metadata.cu_seq_lens,
+                    cu_seqlens_k=cu_seq_lens,
                     k_scale=layer._k_scale,
                     v_scale=layer._v_scale,
                     total_tokens=attn_metadata.num_actual_kv_tokens,
+                    k_values=attn_metadata.k_buffer,
+                    v_values=attn_metadata.v_buffer,
                 )
-
-            _, num_heads, head_size = query.shape
-            nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
-            num_seqs = seqused_k.shape[0]
-            max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM -
-                                  1) // _PARTITION_SIZE_ROCM
-
-            workspace_buffer = torch.empty(
-                (num_seqs * num_heads * max_num_partitions * head_size) *
-                nbytes_per_qo_elem + 2 *
-                (num_seqs * num_heads * max_num_partitions) * 4,
-                dtype=torch.uint8,
-                device=output.device,
-            )
+            if attn_metadata.workspace_buffer is None:
+                _, num_heads, head_size = query.shape
+                nbytes_per_qo_elem = torch.finfo(query.dtype).bits // 8
+                num_seqs = seqused_k.shape[0]
+                max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM -
+                                      1) // _PARTITION_SIZE_ROCM
+
+                workspace_buffer = torch.empty(
+                    (num_seqs * num_heads * max_num_partitions * head_size) *
+                    nbytes_per_qo_elem + 2 *
+                    (num_seqs * num_heads * max_num_partitions) * 4,
+                    dtype=torch.uint8,
+                    device=output.device,
+                )
+            else:
+                workspace_buffer = attn_metadata.workspace_buffer
 
             torch.ops.aiter.paged_attention_v1(
                 output[:num_actual_tokens],
@@ -543,6 +594,12 @@ def forward(
                 None,
                 _PARTITION_SIZE_ROCM,
             )
+            if workspace_buffer is not None:
+                workspace_buffer.zero_()
+            if attn_metadata.k_buffer is not None:
+                attn_metadata.k_buffer.zero_()
+            if attn_metadata.v_buffer is not None:
+                attn_metadata.v_buffer.zero_()
             return output
         else:
             raise NotImplementedError(