Skip to content

Commit 86263c5

Browse files
committed
modify default values to fix bug
Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent e8ba37b commit 86263c5

File tree

3 files changed

+21
-26
lines changed

3 files changed

+21
-26
lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ The details of each config option are as follows:
6767
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
6868
| `enable_pd_transfer` | bool | `False` | Whether to enable pd transfer. When using it, decode is started only when prefill of all requests is done. This option only takes effect on offline inference. |
6969
| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when enable pd transfer. This option only takes effect when enable_pd_transfer is True. |
70-
| `max_long_partial_prefill_tokens` | Union[int, float] | `float('inf')` | the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
71-
| `ascend_long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
70+
| `max_long_partial_prefills` | int | `1` | The maximum number of prompts longer than `long_prefill_token_threshold` that will be prefilled concurrently. |
71+
| `long_prefill_token_threshold` | int | `0` | A request is considered long if its prompt is longer than this number of tokens. When left at `0`, the threshold is derived as 4% of `max_model_len`. |
7272

7373
ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
7474

vllm_ascend/core/schedule_config.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,13 @@
2020

2121
from vllm.config import SchedulerConfig
2222

23-
MAX_INT = 2147483647
2423

2524

2625
@dataclass
2726
class AscendSchedulerConfig(SchedulerConfig):
2827
enable_chunked_prefill: bool = False
29-
max_long_partial_prefill_tokens: int = MAX_INT
30-
ascend_long_prefill_token_threshold: int = MAX_INT
28+
max_long_partial_prefills: int = 1
29+
long_prefill_token_threshold: int = 0
3130
policy: str = "fcfs"
3231
scheduler_cls: Union[str, Type[object]] = (
3332
"vllm_ascend.core.scheduler.AscendScheduler")
@@ -71,27 +70,27 @@ def __post_init__(self) -> None:
7170
"max_num_batched_tokens and makes vLLM reject longer "
7271
"sequences. Please increase max_num_batched_tokens or "
7372
"decrease max_model_len.")
74-
# concurrent partial prefills. Default is inf
75-
if self.max_long_partial_prefill_tokens is None:
76-
self.max_long_partial_prefill_tokens = MAX_INT
77-
self.ascend_long_prefill_token_threshold = MAX_INT
73+
# concurrent partial prefills. Default is 1 meaning not enabled.
74+
if self.max_long_partial_prefills is None:
75+
self.max_long_partial_prefills = 1
76+
self.long_prefill_token_threshold = 0
7877

79-
if self.ascend_long_prefill_token_threshold is None or \
80-
self.ascend_long_prefill_token_threshold <= 0:
78+
if self.long_prefill_token_threshold is None or \
79+
self.long_prefill_token_threshold <= 0:
8180
if self.max_model_len is None:
82-
self.ascend_long_prefill_token_threshold = MAX_INT
81+
self.long_prefill_token_threshold = 0
8382
else:
84-
self.ascend_long_prefill_token_threshold = \
83+
self.long_prefill_token_threshold = \
8584
max(1, int(self.max_model_len * 0.04))
8685

87-
if self.max_long_partial_prefill_tokens < 0:
86+
if self.max_long_partial_prefills < 0:
8887
raise ValueError(
89-
f"max_long_partial_prefill_tokens must be non-negative, but got "
90-
f"{self.max_long_partial_prefill_tokens}")
91-
if self.ascend_long_prefill_token_threshold < 0:
88+
f"max_long_partial_prefills must be non-negative, but got "
89+
f"{self.max_long_partial_prefills}")
90+
if self.long_prefill_token_threshold < 0:
9291
raise ValueError(
93-
f"ascend_long_prefill_token_threshold must be non-negative, but got "
94-
f"{self.ascend_long_prefill_token_threshold}")
92+
f"long_prefill_token_threshold must be non-negative, but got "
93+
f"{self.long_prefill_token_threshold}")
9594

9695
if self.policy != "fcfs":
9796
raise NotImplementedError(

vllm_ascend/core/scheduler.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@
3232
from vllm.v1.request import Request, RequestStatus
3333
from vllm.v1.structured_output import StructuredOutputManager
3434

35-
from vllm_ascend.ascend_config import get_ascend_config
36-
from vllm_ascend.core.schedule_config import MAX_INT
37-
3835

3936
class AscendScheduler(Scheduler):
4037
"""This Scheduler extends vllm's original v1 scheduler
@@ -63,7 +60,6 @@ def __init__(
6360
self.phase = "" if not enable_pd_transfer else "prefill"
6461
self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
6562
decode_max_num_seqs)
66-
self.ascend_config = get_ascend_config()
6763

6864
def schedule(self) -> SchedulerOutput:
6965
if self.scheduler_config.chunked_prefill_enabled:
@@ -108,12 +104,12 @@ def schedule(self) -> SchedulerOutput:
108104
self.phase = "decode"
109105
# Skip long prompt requests in prefill stage.
110106
# long_prefill_budget is float('inf') if not used.
111-
if self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens == MAX_INT:
107+
if self.vllm_config.scheduler_config.long_prefill_token_threshold == 0:
112108
long_prefill_budget = float('inf')
113109
long_prefill_token_threshold = float('inf')
114110
else:
115-
long_prefill_budget = self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens
116-
long_prefill_token_threshold = self.ascend_config.ascend_scheduler_config.ascend_long_prefill_token_threshold
111+
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
112+
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
117113

118114
# Schedule prefill requests first.
119115
while self.waiting and token_budget > 0:

0 commit comments

Comments
 (0)