Skip to content

Commit 8e0adbe

Browse files
committed
[Fix] fix some errors
Signed-off-by: Csrayz <[email protected]>
1 parent 3662c43 commit 8e0adbe

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ The details of each config option are as follows:
5454
| ---- | ---- | ------- | ----------- |
5555
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
5656
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
57-
| `long_prefill_token_threshold` | Union[int, float] | `False` | a request is considered long if the prompt is longer than this number of tokens. |
57+
| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | A request is considered long if the prompt is longer than this number of tokens. |
5858

5959
ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
6060

vllm_ascend/core/schedule_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def __post_init__(self) -> None:
6666
else:
6767
if self.long_prefill_token_threshold is None:
6868
self.long_prefill_token_threshold = \
69-
int(self.max_model_len * 0.04)
69+
max(1, int(self.max_model_len * 0.04))
7070

7171
assert (self.max_long_partial_prefills > 0)
7272
assert (self.long_prefill_token_threshold > 0)

vllm_ascend/core/scheduler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def schedule(self) -> SchedulerOutput:
7878
# Skip long prompt requests in prefill stage.
7979
# long_prefill_budget is float('inf') if not use.
8080
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
81+
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
8182

8283
# Schedule prefill requests first.
8384
while self.waiting and token_budget > 0:
@@ -177,7 +178,7 @@ def skip_cur_request():
177178
skip_cur_request()
178179
continue
179180

180-
if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
181+
if num_new_tokens > long_prefill_token_threshold \
181182
and long_prefill_budget <= 0:
182183
skip_cur_request()
183184
continue
@@ -231,7 +232,8 @@ def skip_cur_request():
231232
# Update request info.
232233
num_scheduled_tokens[request.request_id] = num_new_tokens
233234
token_budget -= num_new_tokens
234-
long_prefill_budget -= 1
235+
if num_new_tokens > long_prefill_token_threshold:
236+
long_prefill_budget -= 1
235237
request.status = RequestStatus.RUNNING
236238
request.num_computed_tokens = num_computed_tokens
237239
# Count the number of prefix cached tokens.

0 commit comments

Comments
 (0)