
Commit 78aa33e

Author: Aleksandr Malyshev (committed)

updated logic for attn selection with default split attn and increased size for CAR

1 parent b193a40, commit 78aa33e

File tree: 4 files changed, +36 -17 lines

vllm/attention/backends/rocm_flash_attn.py
Lines changed: 1 addition & 1 deletion

@@ -784,7 +784,7 @@ def forward(
                    attn_masks[0][None]
                    if attn_masks is not None else None,
                    full_scales,
-                    layer._out_scale,
+                    output_scale,
                )
            else:
                output[:num_prefill_tokens] = self.triton_attn_func(

vllm/distributed/device_communicators/custom_all_reduce.py
Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ class CustomAllreduce:
    def __init__(self,
                 group: ProcessGroup,
                 device: Union[int, str, torch.device],
-                 max_size=2 * 8192 * 1024) -> None:
+                 max_size=8 * 8192 * 1024) -> None:
        """
        Args:
            group: the process group to work on. If None, it will use the
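Note (the byte figures below are simple arithmetic on the two constants in this diff, not stated elsewhere in the commit): the default cap for the custom all-reduce (CAR) buffer grows from 16 MiB to 64 MiB, presumably so that larger tensors remain eligible for the custom all-reduce path.

# Quick check of the old and new defaults, in MiB.
old_max_size = 2 * 8192 * 1024   # 16,777,216 bytes
new_max_size = 8 * 8192 * 1024   # 67,108,864 bytes
print(old_max_size // (1024 * 1024), "MiB ->", new_max_size // (1024 * 1024), "MiB")  # 16 MiB -> 64 MiB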

vllm/v1/attention/backends/rocm_aiter_fa.py
Lines changed: 1 addition & 0 deletions

@@ -325,6 +325,7 @@ def build(self,
            dtype=torch.uint8,
            device=self.device,
        )
+        if max_query_len > 1:
            # We pre-compute cumulative seq len needed for prefill attention
            # here to avoid recomputing it for every layer
            cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
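Note (an inference from the guard, not stated in the diff): max_query_len > 1 indicates that the batch contains at least one prefill request, so the cumulative-sequence-length tensor is now only built when prefill attention will actually use it; decode-only batches skip the work. A minimal sketch of that condition, with hypothetical query_lens/seq_lens tensors:

import torch

# Hypothetical batch: per-request query lengths and total sequence lengths.
# Any query length > 1 means at least one request is still in prefill.
query_lens = torch.tensor([1, 1, 7, 1], dtype=torch.int32)
seq_lens = torch.tensor([9, 12, 7, 30], dtype=torch.int32)
max_query_len = int(query_lens.max())

if max_query_len > 1:
    # Mirrors the guarded block above: cumulative seq lens are only needed
    # for prefill attention.
    cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, dtype=torch.int32)
    cu_seq_lens[1:] = torch.cumsum(seq_lens, dim=0)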

vllm/v1/attention/backends/triton_attn.py
Lines changed: 33 additions & 15 deletions
@@ -30,7 +30,9 @@
if current_platform.is_rocm():
    VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE = envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
    if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE:
-        from aiter.ops.triton.fused_kv_cache import fused_qk_rope_reshape_and_cache
+        from aiter.ops.triton.fused_kv_cache import (
+            fused_qk_rope_reshape_and_cache)
+

@dataclass
class TritonAttentionMetadata:
@@ -250,23 +252,24 @@ def __init__(
                "TritonAttentionImpl")

        self.fp8_dtype = current_platform.fp8_dtype()
-        self.force_prefill_decode_attn = \
-            envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION

        # If not using prefill decode attention, we use the Triton
        # unified attention implementation.
        if use_aiter_unified_attention():
            logger.info_once(
                "Using aiter unified attention for TritonAttentionImpl")
-            from aiter.ops.triton.unified_attention import (
-                unified_attention)
+            from aiter.ops.triton.unified_attention import unified_attention
            self.unified_attention = unified_attention
-        elif not self.force_prefill_decode_attn:
+        elif not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
            logger.info_once(
                "Using vllm unified attention for TritonAttentionImpl")
            from vllm.attention.ops.triton_unified_attention import (
                unified_attention)
            self.unified_attention = unified_attention
+        else:
+            logger.info_once(
+                "Using vllm split prefill decode attention for TritonAttentionImpl"
+            )

        self.sinks = sinks
        if sinks is not None:
@@ -324,30 +327,45 @@ def forward(
        # Whenever making a change in this method, please benchmark the
        # performance to make sure it does not introduce any overhead.

-        use_prefill_decode_attn = self.force_prefill_decode_attn
+        use_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION \
+            and not use_aiter_unified_attention()
        num_actual_tokens = attn_metadata.num_actual_tokens

        if use_prefill_decode_attn:
            key_cache, value_cache = PagedAttention.split_kv_cache(
                kv_cache, self.num_kv_heads, self.head_size)
        else:
            key_cache, value_cache = kv_cache.unbind(0)
-
+
        if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE:
-            assert self.kv_sharing_target_layer_name is None, "self.kv_sharing_target_layer_name error"
-            cos, sin = cos_sin_cache.chunk(2, dim = -1)
+            assert self.kv_sharing_target_layer_name is None, "self.kv_sharing_target_layer_name error"
+            cos, sin = cos_sin_cache.chunk(2, dim=-1)
            is_fp8_kv_cache = self.kv_cache_dtype.startswith("fp8")
            if is_fp8_kv_cache:
                key_cache_og_dtype = key_cache.dtype
                value_cache_og_dtype = value_cache.dtype
                key_cache = key_cache.view(self.fp8_dtype)
                value_cache = value_cache.view(self.fp8_dtype)
            query, key, key_cache, value_cache, output = fused_qk_rope_reshape_and_cache(
-                query, key, value, key_cache, value_cache, attn_metadata.slot_mapping,
-                positions, cos, sin,
-                layer._k_scale, layer._v_scale,
-                is_neox,
-                flash_layout=(not use_prefill_decode_attn), apply_scale=is_fp8_kv_cache, offs=None, q_out=query, k_out=key, output_zeros=True, zeros_out=output)
+                query,
+                key,
+                value,
+                key_cache,
+                value_cache,
+                attn_metadata.slot_mapping,
+                positions,
+                cos,
+                sin,
+                layer._k_scale,
+                layer._v_scale,
+                is_neox,
+                flash_layout=(not use_prefill_decode_attn),
+                apply_scale=is_fp8_kv_cache,
+                offs=None,
+                q_out=query,
+                k_out=key,
+                output_zeros=True,
+                zeros_out=output)
            if is_fp8_kv_cache:
                key_cache = key_cache.view(key_cache_og_dtype)
                value_cache = value_cache.view(value_cache_og_dtype)
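Note: taken together, the __init__ and forward() changes make the backend choice depend only on use_aiter_unified_attention() and envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION, with AITER unified attention taking precedence and the split prefill/decode path used whenever the flag is set (the "default split attn" of the commit title). A minimal standalone sketch of that precedence (plain booleans stand in for the helper and the env flag; this is an illustration, not vLLM code):

def select_attention(aiter_unified: bool, prefill_decode_flag: bool) -> str:
    # aiter_unified stands in for use_aiter_unified_attention();
    # prefill_decode_flag for envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION.
    if aiter_unified:
        return "aiter unified attention"
    if prefill_decode_flag:
        return "vllm split prefill decode attention"
    return "vllm unified attention"

assert select_attention(True, True) == "aiter unified attention"
assert select_attention(False, True) == "vllm split prefill decode attention"
assert select_attention(False, False) == "vllm unified attention"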
