Skip to content

Commit 050f285

Browse files
authored
[Core] Scheduling optimization 2 (#4280)
1 parent 8f2ea22 commit 050f285

File tree

3 files changed

+15
-3
lines changed

3 files changed

+15
-3
lines changed

tests/core/test_scheduler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,8 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
563563
assert len(output.preempted) == 2
564564
# Verify budgets are updated.
565565
assert budget.num_batched_tokens == 1
566-
assert budget.num_curr_seqs == 1
566+
# NOTE: When enable_chunking is False, the num_seqs budget is not updated.
567+
# assert budget.num_curr_seqs == 1
567568
# Both should be preempted, not swapped.
568569
assert output.blocks_to_swap_out == {}
569570
# Nothing is copied.

vllm/core/scheduler.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,12 @@ def _schedule_running(
395395
# We can have up to 1 running prefill at any given time in running
396396
# queue, which means we can guarantee chunk size is at least 1.
397397
assert num_running_tokens != 0
398-
num_running_seqs = seq_group.get_max_num_running_seqs()
399398

400399
running_queue.popleft()
401400
while not self._can_append_slots(seq_group):
402401
budget.subtract_num_batched_tokens(seq_group.request_id,
403402
num_running_tokens)
403+
num_running_seqs = seq_group.get_max_num_running_seqs()
404404
budget.subtract_num_seqs(seq_group.request_id,
405405
num_running_seqs)
406406
if curr_loras is not None and seq_group.lora_int_id > 0:
@@ -439,7 +439,13 @@ def _schedule_running(
439439
token_chunk_size=1))
440440
budget.add_num_batched_tokens(seq_group.request_id,
441441
num_running_tokens)
442-
budget.add_num_seqs(seq_group.request_id, num_running_seqs)
442+
# OPTIMIZATION: Note that get_max_num_running_seqs is
443+
# expensive. For the default scheduling case where
444+
# enable_chunking is False, num_seqs are updated before running
445+
# this method, so we don't have to update it again here.
446+
if enable_chunking:
447+
num_running_seqs = seq_group.get_max_num_running_seqs()
448+
budget.add_num_seqs(seq_group.request_id, num_running_seqs)
443449
if curr_loras is not None and seq_group.lora_int_id > 0:
444450
curr_loras.add(seq_group.lora_int_id)
445451

vllm/sequence.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,11 @@ def get_num_uncomputed_tokens(self) -> int:
508508
return num_uncomputed_tokens
509509

510510
def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
511+
# Optimization. We don't need to call get_seqs if we don't need to
512+
# filter by status.
513+
if status is None:
514+
return len(self.seqs_dict)
515+
511516
return len(self.get_seqs(status))
512517

513518
def num_unfinished_seqs(self) -> int:

0 commit comments

Comments
 (0)