
Commit 410fe43
[FEAT] add Concurrent Partial Prefills

Implement Concurrent Partial Prefills

Signed-off-by: Csrayz <[email protected]>

1 parent 4604882 · commit 410fe43

File tree

2 files changed: +26 -1 lines changed

vllm_ascend/core/schedule_config.py

Lines changed: 16 additions & 1 deletion
@@ -16,14 +16,16 @@
 #
 
 from dataclasses import dataclass, fields
-from typing import Type, Union
+from typing import Optional, Type, Union
 
 from vllm.config import SchedulerConfig
 
 
 @dataclass
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
+    max_long_partial_prefills: Optional[Union[int, float]] = None
+    long_prefill_token_threshold: Optional[Union[int, float]] = None
     policy: str = "fcfs"
     num_scheduler_steps: int = 1
     scheduler_cls: Union[str, Type[object]] = (
@@ -41,6 +43,8 @@ def initialize_from_config(
         }
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
+        scheduler_config["max_long_partial_prefills"] = None
+        scheduler_config["long_prefill_token_threshold"] = None
         scheduler_config["policy"] = "fcfs"
         scheduler_config["num_scheduler_steps"] = 1
         scheduler_config["scheduler_cls"] = (
@@ -55,6 +59,17 @@ def __post_init__(self) -> None:
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        # Concurrent partial prefills. Default is inf (no limit).
+        if self.max_long_partial_prefills is None:
+            self.max_long_partial_prefills = float('inf')
+            self.long_prefill_token_threshold = float('inf')
+        else:
+            if self.long_prefill_token_threshold is None:
+                self.long_prefill_token_threshold = \
+                    int(self.max_model_len * 0.04)
+
+            assert (self.max_long_partial_prefills > 0)
+            assert (self.long_prefill_token_threshold > 0)
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"

vllm_ascend/core/scheduler.py

Lines changed: 10 additions & 0 deletions
@@ -75,6 +75,10 @@ def schedule(self) -> SchedulerOutput:
         # and put back at the head of the waiting queue later
         skipped_waiting_requests: deque[Request] = deque()
 
+        # Skip long prompt requests in prefill stage.
+        # long_prefill_budget is float('inf') if not in use.
+        long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
+
         # Schedule prefill requests first.
         while self.waiting and token_budget > 0:
             if len(self.running) == self.max_num_running_reqs:
@@ -173,6 +177,11 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
+            if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
+                    and long_prefill_budget <= 0:
+                skip_cur_request()
+                continue
+
             new_blocks = self.kv_cache_manager.allocate_slots(
                 request,
                 num_new_tokens + num_external_computed_tokens,
@@ -222,6 +231,7 @@ def skip_cur_request():
             # Update request info.
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
+            long_prefill_budget -= 1
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
             # Count the number of prefix cached tokens.
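Taken together, the scheduler hunks implement a counting gate over the waiting queue: once the long-prefill budget is exhausted, any request whose prefill chunk exceeds `long_prefill_token_threshold` is skipped and requeued, while everything else proceeds. A toy sketch of that control flow, assuming requests are plain `(request_id, num_new_tokens)` tuples (`schedule_step` is hypothetical; the real `schedule()` also tracks token budgets and KV-cache blocks, and requeues via `skip_cur_request()`):

```python
from collections import deque


def schedule_step(waiting: deque, long_prefill_budget: float,
                  long_prefill_token_threshold: float) -> list:
    """Toy model of the long-prefill gate added to AscendScheduler.schedule()."""
    scheduled, skipped = [], deque()
    while waiting:
        request_id, num_new_tokens = waiting.popleft()
        # Skip long prompts once the concurrency budget is used up; they
        # go back to the head of the waiting queue for the next step.
        if (num_new_tokens > long_prefill_token_threshold
                and long_prefill_budget <= 0):
            skipped.append((request_id, num_new_tokens))
            continue
        scheduled.append(request_id)
        # As in the diff, the budget is decremented for every scheduled
        # request, long or short; with the defaults it stays inf.
        long_prefill_budget -= 1
    waiting.extendleft(reversed(skipped))
    return scheduled


# Budget of 2 concurrent long prefills, threshold 1310 tokens: "d" must wait.
reqs = deque([("a", 2000), ("b", 2000), ("c", 100), ("d", 2000)])
print(schedule_step(reqs, 2, 1310))  # ['a', 'b', 'c']
print(reqs)                          # deque([('d', 2000)])
```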
