
Commit 4727a8a

[Attention] Remove unused reorder_batch method (#24463)
Signed-off-by: Matthew Bonanni <[email protected]>
1 parent b8f603c commit 4727a8a

File tree (6 files changed: 8 additions, 64 deletions)

  tests/v1/logits_processors/test_correctness.py
  vllm/v1/attention/backends/flashinfer.py
  vllm/v1/attention/backends/flex_attention.py
  vllm/v1/attention/backends/tree_attn.py
  vllm/v1/attention/backends/utils.py
  vllm/v1/attention/backends/xformers.py
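At a glance, the commit drops the per-backend reorder_batch() override in favor of a reorder_batch_threshold attribute that the shared builder infrastructure consumes. Below is a minimal sketch of the old and new builder shapes, using stand-in classes and a stubbed helper rather than the real vLLM types; the base-class behavior implied by the "after" form is an assumption inferred from this diff, not confirmed by it.

# Sketch only: OldBuilder/NewBuilder are stand-ins, not the real
# vLLM AttentionMetadataBuilder subclasses.

def reorder_batch_to_split_decodes_and_prefills(input_batch, scheduler_output,
                                                decode_threshold=1):
    """Placeholder for the shared helper in vllm/v1/attention/backends/utils.py."""
    return False  # the real helper reorders the batch in place and reports whether it changed


class OldBuilder:
    # Before: each backend carried its own thin override.
    def reorder_batch(self, input_batch, scheduler_output) -> bool:
        return reorder_batch_to_split_decodes_and_prefills(
            input_batch, scheduler_output, decode_threshold=1
        )


class NewBuilder:
    # After: the backend only declares a threshold; shared code (assumed)
    # decides when and how to reorder.
    def __init__(self):
        self.reorder_batch_threshold = 1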

tests/v1/logits_processors/test_correctness.py
Lines changed: 1 addition & 1 deletion

@@ -581,7 +581,7 @@ def _generate_fake_step_update(
     persistent_batch[:] = persistent_batch[0:condensed_batch_size]

     if condensed_batch_size > 1:
-        # Simulate arbitrary reorder_batch() in the kernel backend
+        # Simulate arbitrary batch ordering in the kernel backend
         # Generate a random number k of non-overlapping swap tuples
         k = random.randint(0, condensed_batch_size // 2)
         idxs = list(range(condensed_batch_size))
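The comment above describes simulating arbitrary backend reordering with k non-overlapping swaps. A standalone sketch of that idea follows; the function name and the list-based batch representation are simplifications for illustration, not the test's actual fixtures.

import random

def random_nonoverlapping_swaps(batch, rng=random):
    """Shuffle a batch with k non-overlapping index swaps, as the test simulates."""
    n = len(batch)
    if n <= 1:
        return batch
    k = rng.randint(0, n // 2)          # number of swap pairs
    idxs = list(range(n))
    rng.shuffle(idxs)
    for j in range(k):
        a, b = idxs[2 * j], idxs[2 * j + 1]   # disjoint pairs -> non-overlapping swaps
        batch[a], batch[b] = batch[b], batch[a]
    return batch

print(random_nonoverlapping_swaps(["req0", "req1", "req2", "req3", "req4"]))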

vllm/v1/attention/backends/flashinfer.py
Lines changed: 2 additions & 4 deletions

@@ -602,8 +602,7 @@ def build(
             )
         else:
             # Regular attention (common case).
-            # Decodes are at the front and prefills are at the back,
-            # according to reorder_batch()
+            # Decodes are at the front and prefills are at the back.
             num_prefills = attn_metadata.num_prefills
             num_decodes = attn_metadata.num_decodes
             if num_prefills > 0:

@@ -925,8 +924,7 @@ def forward(
         stride_order = FlashInferBackend.get_kv_cache_stride_order()
         kv_cache_permute = kv_cache.permute(*stride_order)
         # Regular attention (common case).
-        # Decodes are at the front and prefills are at the back,
-        # according to reorder_batch()
+        # Decodes are at the front and prefills are at the back.
         if num_prefill_tokens > 0:
             prefill_wrapper = attn_metadata.prefill_wrapper
             prefill_query = query[num_decode_tokens:]
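The surviving comment states the invariant the builder relies on: decode requests occupy the front of the batch and prefills the back, so the query tensor can be split with a single index. A small illustration of that slicing is below; the shapes and variable values are invented for the example, and only the split-by-num_decode_tokens idea comes from the diff.

import torch

num_decode_tokens = 3   # e.g. 3 decode requests, 1 token each (example values)
num_prefill_tokens = 8  # remaining tokens belong to prefill requests
num_heads, head_size = 4, 64

# Decodes at the front, prefills at the back (the invariant noted above).
query = torch.randn(num_decode_tokens + num_prefill_tokens, num_heads, head_size)

decode_query = query[:num_decode_tokens]    # handed to the decode path
prefill_query = query[num_decode_tokens:]   # handed to the prefill path

assert decode_query.shape[0] == num_decode_tokens
assert prefill_query.shape[0] == num_prefill_tokens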

vllm/v1/attention/backends/flex_attention.py
Lines changed: 1 addition & 10 deletions

@@ -3,7 +3,7 @@
 """Attention layer with FlexAttention."""

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional, Union

 import torch
 import torch._dynamo.decorators

@@ -38,10 +38,6 @@

 logger = init_logger(__name__)

-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
 create_block_mask_compiled = torch.compile(
     create_block_mask, fullgraph=True, mode="reduce-overhead"
 )

@@ -600,11 +596,6 @@ def __init__(
         self.q_block_size: int = 16 if is_torch_equal_or_newer("2.9.0.dev0") else 128
         self.kv_block_size: int = 16 if is_torch_equal_or_newer("2.9.0.dev0") else 128

-    def reorder_batch(
-        self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput"
-    ) -> bool:
-        return False
-
     def build(
         self,
         common_prefix_len: int,

vllm/v1/attention/backends/tree_attn.py
Lines changed: 3 additions & 14 deletions

@@ -4,10 +4,11 @@

 import ast
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import Optional

 import torch

+from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,

@@ -20,17 +21,10 @@
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    reorder_batch_to_split_decodes_and_prefills,
     split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec

-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
-from vllm import _custom_ops as ops
-
 logger = init_logger(__name__)


@@ -189,12 +183,7 @@ def __init__(
             device=device,
         )

-    def reorder_batch(
-        self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput"
-    ) -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(
-            input_batch, scheduler_output, decode_threshold=self.tree_attn_bias.shape[0]
-        )
+        self.reorder_batch_threshold = self.tree_attn_bias.shape[0]

     def build(
         self,
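Here the explicit override collapses into one assignment: the builder records self.reorder_batch_threshold = self.tree_attn_bias.shape[0] in its constructor and leaves the actual reordering to shared code. A toy version of that constructor is sketched below; everything except the two attribute names taken from the diff is an assumption, including the stub class and the meaning attached to the bias tensor's first dimension.

import torch

class TreeAttnBuilderSketch:
    """Stand-in for the tree-attention metadata builder touched in this diff."""

    def __init__(self, tree_attn_bias: torch.Tensor):
        # In the old code, tree_attn_bias.shape[0] was passed as decode_threshold,
        # so requests with at most that many scheduled tokens count as decodes.
        self.tree_attn_bias = tree_attn_bias
        self.reorder_batch_threshold = self.tree_attn_bias.shape[0]

builder = TreeAttnBuilderSketch(torch.zeros(4, 4))
print(builder.reorder_batch_threshold)  # 4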

vllm/v1/attention/backends/utils.py
Lines changed: 0 additions & 22 deletions

@@ -299,24 +299,6 @@ def build(
         """
         raise NotImplementedError

-    def reorder_batch(
-        self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput"
-    ) -> bool:
-        """
-        Update the order of requests in the batch based on the attention
-        backend's needs. For example, some attention backends (namely MLA) may
-        want to separate requests based on if the attention computation will be
-        compute-bound or memory-bound.
-
-        Args:
-            input_batch: input batch
-            scheduler_output: scheduler output.
-
-        Returns:
-            True if the batch was modified, False otherwise.
-        """
-        raise NotImplementedError
-
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ) -> M:

@@ -828,10 +810,6 @@ def reorder_batch_to_split_decodes_and_prefills(

     for i, req_id in enumerate(input_batch.req_ids):
         num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-        # for now treat 1 scheduled token as "decode" even if it's not,
-        # we should update this to something like < 8 in the future but
-        # currently the TritonMLA._forward_decode only supports
-        # num_tokens = 1
        if num_tokens <= decode_threshold:
             decodes.append(i)
             num_decode_tokens += num_tokens
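The second hunk trims an outdated comment from reorder_batch_to_split_decodes_and_prefills; the classification it performs, treating a request as "decode" when its scheduled token count is at or below decode_threshold, is unchanged. A self-contained sketch of that split over plain Python data (not the real InputBatch/SchedulerOutput objects) follows.

def split_decodes_and_prefills_sketch(req_ids, num_scheduled_tokens, decode_threshold=1):
    """Classify requests as decodes or prefills by scheduled token count."""
    decodes, prefills = [], []
    num_decode_tokens = num_prefill_tokens = 0
    for i, req_id in enumerate(req_ids):
        num_tokens = num_scheduled_tokens[req_id]
        if num_tokens <= decode_threshold:
            decodes.append(i)
            num_decode_tokens += num_tokens
        else:
            prefills.append(i)
            num_prefill_tokens += num_tokens
    # The real helper then swaps batch rows so the decode requests come first.
    return decodes, prefills, num_decode_tokens, num_prefill_tokens

print(split_decodes_and_prefills_sketch(
    ["a", "b", "c"], {"a": 1, "b": 17, "c": 1}, decode_threshold=1
))  # ([0, 2], [1], 2, 17)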

vllm/v1/attention/backends/xformers.py
Lines changed: 1 addition & 13 deletions

@@ -3,7 +3,7 @@
 """Attention layer with XFormersAttention."""

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import Optional

 import torch


@@ -19,7 +19,6 @@
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    reorder_batch_to_split_decodes_and_prefills,
     split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec

@@ -35,10 +34,6 @@
 except ImportError:
     XFORMERS_AVAILABLE = False

-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
 from vllm import _custom_ops as ops

 logger = init_logger(__name__)

@@ -223,13 +218,6 @@ def __init__(
         self._num_decodes = 0
         self._num_decode_tokens = 0

-    def reorder_batch(
-        self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput"
-    ) -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(
-            input_batch, scheduler_output, decode_threshold=self.reorder_batch_threshold
-        )
-
     def build(
         self,
         common_prefix_len: int,
