
Commit 3ca22f7

[FEAT] Resolve Conflicts
Signed-off-by: Csrayz <[email protected]>
1 parent 98c6822 commit 3ca22f7

2 files changed: 26 additions & 1 deletion


vllm_ascend/core/schedule_config.py

Lines changed: 16 additions & 1 deletion
@@ -16,14 +16,16 @@
 #
 
 from dataclasses import dataclass, fields
-from typing import Type, Union
+from typing import Optional, Type, Union
 
 from vllm.config import SchedulerConfig
 
 
 @dataclass
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
+    max_long_partial_prefills: Optional[Union[int, float]] = None
+    long_prefill_token_threshold: Optional[Union[int, float]] = None
     policy: str = "fcfs"
     num_scheduler_steps: int = 1
     scheduler_cls: Union[str, Type[object]] = (
@@ -41,6 +43,8 @@ def initialize_from_config(
         }
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
+        scheduler_config["max_long_partial_prefills"] = None
+        scheduler_config["long_prefill_token_threshold"] = None
         scheduler_config["policy"] = "fcfs"
         scheduler_config["num_scheduler_steps"] = 1
         scheduler_config["scheduler_cls"] = (
@@ -65,6 +69,17 @@ def __post_init__(self) -> None:
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
                 "decrease max_model_len.")
+        # concurrent partial prefills. Default is inf
+        if self.max_long_partial_prefills is None:
+            self.max_long_partial_prefills = float('inf')
+            self.long_prefill_token_threshold = float('inf')
+        else:
+            if self.long_prefill_token_threshold is None:
+                self.long_prefill_token_threshold = \
+                    int(self.max_model_len * 0.04)
+
+            assert (self.max_long_partial_prefills > 0)
+            assert (self.long_prefill_token_threshold > 0)
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"

vllm_ascend/core/scheduler.py

Lines changed: 10 additions & 0 deletions
@@ -85,6 +85,10 @@ def schedule(self) -> SchedulerOutput:
         # and put back at the head of the waiting queue later
         skipped_waiting_requests: deque[Request] = deque()
 
+        # Skip long prompt requests in prefill stage.
+        # long_prefill_budget is float('inf') if not use.
+        long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
+
         # Schedule prefill requests first.
         while self.waiting and token_budget > 0:
             if len(self.running) == self.max_num_running_reqs:
@@ -183,6 +187,11 @@ def skip_cur_request():
                 skip_cur_request()
                 continue
 
+            if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
+                    and long_prefill_budget <= 0:
+                skip_cur_request()
+                continue
+
             new_blocks = self.kv_cache_manager.allocate_slots(
                 request,
                 num_new_tokens + num_external_computed_tokens,
@@ -235,6 +244,7 @@ def skip_cur_request():
             # Update request info.
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
+            long_prefill_budget -= 1
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
             # Count the number of prefix cached tokens.
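
To make the interaction between the three hunks above concrete, here is a standalone sketch of the gating behaviour added to schedule(): a request whose number of new prefill tokens exceeds long_prefill_token_threshold is skipped once long_prefill_budget is exhausted, and the budget counts down with every request that does get scheduled. The function schedule_prefills and the simplified queue of token counts are illustrative stand-ins, not the real scheduler state.

from collections import deque
from typing import Deque, List

def schedule_prefills(waiting: Deque[int],
                      token_budget: int,
                      max_long_partial_prefills: float,
                      long_prefill_token_threshold: float) -> List[int]:
    """Return the prompt sizes scheduled this step; each item in `waiting`
    is simply a number of new prefill tokens."""
    scheduled: List[int] = []
    skipped: Deque[int] = deque()
    # Counts down with every scheduled prefill; once it reaches zero,
    # prompts over the threshold are deferred. float('inf') disables gating.
    long_prefill_budget = max_long_partial_prefills
    while waiting and token_budget > 0:
        num_new_tokens = waiting[0]
        # Skip long prompts once the budget is exhausted; they are put
        # back at the head of the waiting queue afterwards.
        if num_new_tokens > long_prefill_token_threshold \
                and long_prefill_budget <= 0:
            skipped.append(waiting.popleft())
            continue
        if num_new_tokens > token_budget:
            break
        waiting.popleft()
        scheduled.append(num_new_tokens)
        token_budget -= num_new_tokens
        long_prefill_budget -= 1
    waiting.extendleft(reversed(skipped))
    return scheduled

# With a budget of 1 long prefill and a 1000-token threshold, the second
# long prompt (1800 tokens) is deferred to a later scheduling step.
print(schedule_prefills(deque([1500, 200, 1800]), 4096, 1, 1000))  # [1500, 200]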
