Skip to content

Commit ffee8b2

Browse files
committed
modify default values to fix bug
Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent 1ff8395 commit ffee8b2

File tree

3 files changed

+22
-26
lines changed

3 files changed

+22
-26
lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ The details of each config option are as follows:
6767
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
6868
| `enable_pd_transfer` | bool | `False` | Whether to enable pd transfer. When using it, decode is started only when prefill of all requests is done. This option only takes effects on offline inference. |
6969
| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when enable pd transfer. This option only takes effects when enable_pd_transfer is True. |
70-
| `max_long_partial_prefill_tokens` | Union[int, float] | `float('inf')` | the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
71-
| `ascend_long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
70+
| `max_long_partial_prefills` | int | `1` | The maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
71+
| `long_prefill_token_threshold` | int | `0` | A request is considered long if the prompt is longer than this number of tokens. |
7272

7373
ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
7474

vllm_ascend/core/schedule_config.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
@dataclass
2727
class AscendSchedulerConfig(SchedulerConfig):
2828
enable_chunked_prefill: bool = False
29-
max_long_partial_prefill_tokens: int = MAX_INT
30-
ascend_long_prefill_token_threshold: int = MAX_INT
29+
max_long_partial_prefills: int = 1
30+
long_prefill_token_threshold: int = 0
3131
policy: str = "fcfs"
3232
scheduler_cls: Union[str, Type[object]] = (
3333
"vllm_ascend.core.scheduler.AscendScheduler")
@@ -71,27 +71,27 @@ def __post_init__(self) -> None:
7171
"max_num_batched_tokens and makes vLLM reject longer "
7272
"sequences. Please increase max_num_batched_tokens or "
7373
"decrease max_model_len.")
74-
# concurrent partial prefills. Default is inf
75-
if self.max_long_partial_prefill_tokens is None:
76-
self.max_long_partial_prefill_tokens = MAX_INT
77-
self.ascend_long_prefill_token_threshold = MAX_INT
74+
# concurrent partial prefills. Default is 1 meaning not enabled.
75+
if self.max_long_partial_prefills is None:
76+
self.max_long_partial_prefills = 1
77+
self.long_prefill_token_threshold = 0
7878

79-
if self.ascend_long_prefill_token_threshold is None or \
80-
self.ascend_long_prefill_token_threshold <= 0:
79+
if self.long_prefill_token_threshold is None or \
80+
self.long_prefill_token_threshold <= 0:
8181
if self.max_model_len is None:
82-
self.ascend_long_prefill_token_threshold = MAX_INT
82+
self.long_prefill_token_threshold = MAX_INT
8383
else:
84-
self.ascend_long_prefill_token_threshold = \
84+
self.long_prefill_token_threshold = \
8585
max(1, int(self.max_model_len * 0.04))
8686

87-
if self.max_long_partial_prefill_tokens < 0:
87+
if self.max_long_partial_prefills < 0:
8888
raise ValueError(
89-
f"max_long_partial_prefill_tokens must be non-negative, but got "
90-
f"{self.max_long_partial_prefill_tokens}")
91-
if self.ascend_long_prefill_token_threshold < 0:
89+
f"max_long_partial_prefills must be non-negative, but got "
90+
f"{self.max_long_partial_prefills}")
91+
if self.long_prefill_token_threshold < 0:
9292
raise ValueError(
93-
f"ascend_long_prefill_token_threshold must be non-negative, but got "
94-
f"{self.ascend_long_prefill_token_threshold}")
93+
f"long_prefill_token_threshold must be non-negative, but got "
94+
f"{self.long_prefill_token_threshold}")
9595

9696
if self.policy != "fcfs":
9797
raise NotImplementedError(
@@ -103,4 +103,4 @@ def __post_init__(self) -> None:
103103
if getattr(self, "scheduler_delay_factor", 0) > 0:
104104
raise NotImplementedError(
105105
"currently AscendScheduler doesn't support scheduler_delay_factor."
106-
)
106+
)

vllm_ascend/core/scheduler.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@
3232
from vllm.v1.request import Request, RequestStatus
3333
from vllm.v1.structured_output import StructuredOutputManager
3434

35-
from vllm_ascend.ascend_config import get_ascend_config
36-
from vllm_ascend.core.schedule_config import MAX_INT
37-
3835

3936
class AscendScheduler(Scheduler):
4037
"""This Scheduler extends vllm's original v1 scheduler
@@ -63,7 +60,6 @@ def __init__(
6360
self.phase = "" if not enable_pd_transfer else "prefill"
6461
self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
6562
decode_max_num_seqs)
66-
self.ascend_config = get_ascend_config()
6763

6864
def schedule(self) -> SchedulerOutput:
6965
if self.scheduler_config.chunked_prefill_enabled:
@@ -108,12 +104,12 @@ def schedule(self) -> SchedulerOutput:
108104
self.phase = "decode"
109105
# Skip long prompt requests in prefill stage.
110106
# long_prefill_budget is float('inf') if not used.
111-
if self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens == MAX_INT:
107+
if self.vllm_config.scheduler_config.long_prefill_token_threshold == 0:
112108
long_prefill_budget = float('inf')
113109
long_prefill_token_threshold = float('inf')
114110
else:
115-
long_prefill_budget = self.ascend_config.ascend_scheduler_config.max_long_partial_prefill_tokens
116-
long_prefill_token_threshold = self.ascend_config.ascend_scheduler_config.ascend_long_prefill_token_threshold
111+
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills
112+
long_prefill_token_threshold = self.vllm_config.scheduler_config.long_prefill_token_threshold
117113

118114
# Schedule prefill requests first.
119115
while self.waiting and token_budget > 0:

0 commit comments

Comments
 (0)