
Commit 3f65494

[0.9.1][bugfix] fix configuration check logic about ascend scheduler (#2327)
### What this PR does / why we need it?

Add configuration check logic for the ascend scheduler:

1) if chunked prefill is disabled, `max_num_batched_tokens` cannot be less than `max_model_len`, following vLLM;
2) if the ascend scheduler is disabled, MTP cannot be enabled.

### Does this PR introduce any user-facing change?

1) Users cannot enable MTP without the ascend scheduler.
2) Users cannot set `max_num_batched_tokens` smaller than `max_model_len` with the ascend scheduler when chunked prefill is disabled.

### How was this patch tested?

CI and vLLM serving passed.

Signed-off-by: linfeng-yuan <[email protected]>
1 parent 19c7b3e commit 3f65494
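Taken together, the two checks shape how users configure the engine. Below is a minimal sketch of a setup that passes both; the model name, token limits, and the exact `speculative_config` shape are illustrative assumptions, not taken from this commit:

```python
from vllm import LLM

# Illustrative configuration; model name and limits are assumptions.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # assumed MTP-capable model
    # MTP now requires the ascend scheduler; otherwise
    # check_ascend_config raises NotImplementedError.
    additional_config={"ascend_scheduler_config": {"enabled": True}},
    # With chunked prefill disabled, max_num_batched_tokens must be at
    # least max_model_len, or AscendSchedulerConfig raises ValueError.
    max_model_len=4096,
    max_num_batched_tokens=4096,
    # Assumed speculative-decoding config shape for deepseek MTP.
    speculative_config={
        "method": "deepseek_mtp",
        "num_speculative_tokens": 1,
    },
    enforce_eager=True,
)
```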

File tree

4 files changed (+24, -1 lines changed)


tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -78,7 +78,12 @@ def test_mtp_correctness(
         },
         max_model_len=256,
         gpu_memory_utilization=0.8,
-        enforce_eager=True)
+        enforce_eager=True,
+        additional_config={
+            "ascend_scheduler_config": {
+                "enabled": True
+            },
+        })
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
```

tests/singlecard/core/test_ascend_scheduler_e2e.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -18,6 +18,7 @@ def model() -> LLM:
         MODEL,
         enforce_eager=True,
         enable_prefix_caching=True,
+        max_model_len=200,
         max_num_batched_tokens=200,
         max_num_seqs=3,
         additional_config={"ascend_scheduler_config": {
```

vllm_ascend/ascend_config.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -203,6 +203,13 @@ def check_ascend_config(vllm_config, enforce_eager):
                 "Ascend scheduler is only supported for V1 Engine.")
     # for v1 engine
     else:
+        # TODO(yexiong): remove this verification after mtp model supports original vllm scheduler
+        if (not ascend_config.ascend_scheduler_config.enabled
+                and vllm_config.speculative_config
+                and vllm_config.speculative_config.method == 'deepseek_mtp'):
+            raise NotImplementedError(
+                "Currently deepseek MTP model is only supported for ascend scheduler."
+            )
     # for eager mode
     if enforce_eager:
         # torchair_graph cannot be enabled with eager mode.
```
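For reference, a standalone sketch of the new guard; the `SimpleNamespace` stand-ins for the real config objects are illustrative, not the actual vLLM types:

```python
from types import SimpleNamespace

# Illustrative stand-ins for the real vllm_config / ascend_config
# objects; the actual types live in vLLM and vllm-ascend.
ascend_config = SimpleNamespace(
    ascend_scheduler_config=SimpleNamespace(enabled=False))
vllm_config = SimpleNamespace(
    speculative_config=SimpleNamespace(method='deepseek_mtp'))

# Same condition as the added check: MTP requested while the ascend
# scheduler is disabled.
try:
    if (not ascend_config.ascend_scheduler_config.enabled
            and vllm_config.speculative_config
            and vllm_config.speculative_config.method == 'deepseek_mtp'):
        raise NotImplementedError(
            "Currently deepseek MTP model is only supported for ascend scheduler."
        )
except NotImplementedError as e:
    print(f"configuration rejected: {e}")
```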

vllm_ascend/core/schedule_config.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -55,6 +55,16 @@ def __post_init__(self) -> None:
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
+            raise ValueError(
+                "Ascend scheduler is enabled without chunked prefill feature. "
+                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
```
