@@ -299,24 +299,6 @@ def build(
         """
         raise NotImplementedError
 
-    def reorder_batch(
-        self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput"
-    ) -> bool:
-        """
-        Update the order of requests in the batch based on the attention
-        backend's needs. For example, some attention backends (namely MLA) may
-        want to separate requests based on if the attention computation will be
-        compute-bound or memory-bound.
-
-        Args:
-            input_batch: input batch
-            scheduler_output: scheduler output.
-
-        Returns:
-            True if the batch was modified, False otherwise.
-        """
-        raise NotImplementedError
-
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ) -> M:
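For context on what implementations of the removed hook did, here is a minimal sketch of a decodes-before-prefills reorder, loosely following the pattern the docstring describes for MLA backends. It assumes input_batch.swap_states(i, j) swaps all per-request state held at two batch indices; the names and details are illustrative, not the actual backend code.

# Illustrative sketch only, not vLLM's implementation.
# Assumes input_batch.swap_states(i, j) swaps the per-request state
# stored at batch indices i and j.
def reorder_batch(self, input_batch, scheduler_output) -> bool:
    decodes, prefills = [], []
    for i, req_id in enumerate(input_batch.req_ids):
        # Treat single-token requests as decodes (memory-bound),
        # everything else as prefills (compute-bound).
        if scheduler_output.num_scheduled_tokens[req_id] == 1:
            decodes.append(i)
        else:
            prefills.append(i)

    # Decodes should occupy indices [0, len(decodes)). Both lists are in
    # ascending index order, so each misplaced decode (one sitting past the
    # decode region) pairs up one-for-one with one of the earliest prefills.
    num_decodes = len(decodes)
    first_prefill = 0
    modified = False
    for i in range(1, min(num_decodes, len(prefills)) + 1):
        if decodes[num_decodes - i] < num_decodes:
            break  # remaining decodes are already inside the decode region
        input_batch.swap_states(prefills[first_prefill],
                                decodes[num_decodes - i])
        first_prefill += 1
        modified = True
    return modified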
@@ -828,10 +810,6 @@ def reorder_batch_to_split_decodes_and_prefills(
 
     for i, req_id in enumerate(input_batch.req_ids):
         num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-        # for now treat 1 scheduled token as "decode" even if it's not,
-        # we should update this to something like < 8 in the future but
-        # currently the TritonMLA._forward_decode only supports
-        # num_tokens = 1
         if num_tokens <= decode_threshold:
             decodes.append(i)
             num_decode_tokens += num_tokens
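The surviving loop classifies requests purely by decode_threshold, so the deleted comment about the hard-coded threshold of 1 (a TritonMLA._forward_decode limitation) is stale. A self-contained sketch of the split that loop performs, with made-up token counts:

# Standalone illustration of the threshold split (hypothetical data; the
# real function operates on InputBatch / SchedulerOutput objects).
# Requests with at most decode_threshold scheduled tokens count as decodes.
num_scheduled_tokens = {"req-a": 1, "req-b": 1, "req-c": 37}  # hypothetical
decode_threshold = 1

decodes, prefills = [], []
num_decode_tokens = num_prefill_tokens = 0
for i, req_id in enumerate(num_scheduled_tokens):
    n = num_scheduled_tokens[req_id]
    if n <= decode_threshold:
        decodes.append(i)
        num_decode_tokens += n
    else:
        prefills.append(i)
        num_prefill_tokens += n

assert decodes == [0, 1] and prefills == [2]
assert num_decode_tokens == 2 and num_prefill_tokens == 37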