@@ -395,12 +395,12 @@ def _schedule_running(
             # We can have up to 1 running prefill at any given time in running
             # queue, which means we can guarantee chunk size is at least 1.
             assert num_running_tokens != 0
-            num_running_seqs = seq_group.get_max_num_running_seqs()

             running_queue.popleft()
             while not self._can_append_slots(seq_group):
                 budget.subtract_num_batched_tokens(seq_group.request_id,
                                                    num_running_tokens)
+                num_running_seqs = seq_group.get_max_num_running_seqs()
                 budget.subtract_num_seqs(seq_group.request_id,
                                          num_running_seqs)
                 if curr_loras is not None and seq_group.lora_int_id > 0:
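The first hunk takes the get_max_num_running_seqs() call off the common path: instead of computing it for every running group up front, it is computed only inside the preemption loop, i.e. only when a group cannot get its slots and must return its reservation to the budget. Below is a minimal, self-contained sketch of that control flow; Budget, Group, and reclaim_budget_on_preempt are simplified stand-ins invented for illustration (and the retry loop is collapsed to a single check), not vLLM's SchedulingBudget / SequenceGroup API.

from dataclasses import dataclass
from typing import Callable


@dataclass
class Budget:
    # Simplified stand-in for the scheduler budget: two plain counters.
    num_batched_tokens: int = 0
    num_seqs: int = 0


class Group:
    # Simplified stand-in for a sequence group. In the real scheduler,
    # get_max_num_running_seqs() walks the group's sequences, which is why
    # the diff avoids calling it unless it is actually needed.
    def get_max_num_running_seqs(self) -> int:
        return 2


def reclaim_budget_on_preempt(budget: Budget, group: Group,
                              num_running_tokens: int,
                              can_append_slots: Callable[[Group], bool]) -> None:
    # Mirrors the new shape of the loop: the expensive call happens only on
    # the (rare) path where slots do not fit and the group's reservation has
    # to be given back to the budget.
    if not can_append_slots(group):
        budget.num_batched_tokens -= num_running_tokens
        num_running_seqs = group.get_max_num_running_seqs()  # expensive, but rare
        budget.num_seqs -= num_running_seqs


# Fast path: slots fit, so get_max_num_running_seqs() is never called.
b = Budget(num_batched_tokens=16, num_seqs=4)
reclaim_budget_on_preempt(b, Group(), num_running_tokens=8,
                          can_append_slots=lambda g: True)
assert (b.num_batched_tokens, b.num_seqs) == (16, 4)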
@@ -439,7 +439,13 @@ def _schedule_running(
                                           token_chunk_size=1))
             budget.add_num_batched_tokens(seq_group.request_id,
                                           num_running_tokens)
-            budget.add_num_seqs(seq_group.request_id, num_running_seqs)
+            # OPTIMIZATION: Note that get_max_num_running_seqs is
+            # expensive. For the default scheduling case where
+            # enable_chunking is False, num_seqs are updated before running
+            # this method, so we don't have to update it again here.
+            if enable_chunking:
+                num_running_seqs = seq_group.get_max_num_running_seqs()
+                budget.add_num_seqs(seq_group.request_id, num_running_seqs)
             if curr_loras is not None and seq_group.lora_int_id > 0:
                 curr_loras.add(seq_group.lora_int_id)

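The second hunk applies the same idea on the add side. Under the default path (enable_chunking=False) the group's num_seqs has already been charged to the budget before _schedule_running runs, so recomputing and re-adding it here would be wasted work; only chunked prefill needs the update. Reusing the Budget and Group stand-ins from the sketch above (again an illustration under those assumptions, not the real vLLM method):

def charge_running_group(budget: Budget, group: Group,
                         num_running_tokens: int,
                         enable_chunking: bool) -> None:
    # Tokens scheduled this step are always charged.
    budget.num_batched_tokens += num_running_tokens
    # num_seqs is only re-charged under chunked prefill; in the default path
    # it was already counted upstream, so the expensive call is skipped.
    if enable_chunking:
        budget.num_seqs += group.get_max_num_running_seqs()


# Default path: num_seqs (4) was pre-counted, only tokens are added.
b = Budget(num_batched_tokens=0, num_seqs=4)
charge_running_group(b, Group(), num_running_tokens=8, enable_chunking=False)
assert (b.num_batched_tokens, b.num_seqs) == (8, 4)

# Chunked-prefill path: num_seqs is charged here instead.
b = Budget(num_batched_tokens=0, num_seqs=0)
charge_running_group(b, Group(), num_running_tokens=8, enable_chunking=True)
assert (b.num_batched_tokens, b.num_seqs) == (8, 2)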