
Commit 3f65494

[0.9.1][bugfix] fix configuration check logic about ascend scheduler (#2327)
### What this PR does / why we need it?

Add configuration check logic for the ascend scheduler:

1) if chunked prefill is disabled, `max_num_batched_tokens` cannot be less than `max_model_len`, following vLLM;
2) if the ascend scheduler is disabled, MTP cannot be enabled.

### Does this PR introduce any user-facing change?

1) Users cannot enable MTP without the ascend scheduler.
2) Users cannot set `max_num_batched_tokens` smaller than `max_model_len` with the ascend scheduler when chunked prefill is disabled.

### How was this patch tested?

CI and vLLM serving passed.

Signed-off-by: linfeng-yuan <[email protected]>
1 parent 19c7b3e commit 3f65494
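Taken together, the two checks shape how users configure the engine. Below is a minimal sketch of a setup that passes both; the model name, token limits, and the exact `speculative_config` shape are illustrative assumptions, not taken from this commit:

```python
from vllm import LLM

# Illustrative configuration; model name and limits are assumptions.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # assumed MTP-capable model
    # MTP now requires the ascend scheduler; otherwise
    # check_ascend_config raises NotImplementedError.
    additional_config={"ascend_scheduler_config": {"enabled": True}},
    # With chunked prefill disabled, max_num_batched_tokens must be at
    # least max_model_len, or AscendSchedulerConfig raises ValueError.
    max_model_len=4096,
    max_num_batched_tokens=4096,
    # Assumed speculative-decoding config shape for deepseek MTP.
    speculative_config={
        "method": "deepseek_mtp",
        "num_speculative_tokens": 1,
    },
    enforce_eager=True,
)
```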

File tree

4 files changed (+24, -1 lines changed)


tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -78,7 +78,12 @@ def test_mtp_correctness(
         },
         max_model_len=256,
         gpu_memory_utilization=0.8,
-        enforce_eager=True)
+        enforce_eager=True,
+        additional_config={
+            "ascend_scheduler_config": {
+                "enabled": True
+            },
+        })
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
```

tests/singlecard/core/test_ascend_scheduler_e2e.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -18,6 +18,7 @@ def model() -> LLM:
         MODEL,
         enforce_eager=True,
         enable_prefix_caching=True,
+        max_model_len=200,
         max_num_batched_tokens=200,
         max_num_seqs=3,
         additional_config={"ascend_scheduler_config": {
```

vllm_ascend/ascend_config.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -203,6 +203,13 @@ def check_ascend_config(vllm_config, enforce_eager):
                 "Ascend scheduler is only supported for V1 Engine.")
     # for v1 engine
     else:
+        # TODO(yexiong): remove this verification after mtp model supports original vllm scheduler
+        if (not ascend_config.ascend_scheduler_config.enabled
+                and vllm_config.speculative_config
+                and vllm_config.speculative_config.method == 'deepseek_mtp'):
+            raise NotImplementedError(
+                "Currently deepseek MTP model is only supported for ascend scheduler."
+            )
     # for eager mode
     if enforce_eager:
         # torchair_graph cannot be enabled with eager mode.
```
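For reference, a standalone sketch of the new guard; the `SimpleNamespace` stand-ins for the real config objects are illustrative, not the actual vLLM types:

```python
from types import SimpleNamespace

# Illustrative stand-ins for the real vllm_config / ascend_config
# objects; the actual types live in vLLM and vllm-ascend.
ascend_config = SimpleNamespace(
    ascend_scheduler_config=SimpleNamespace(enabled=False))
vllm_config = SimpleNamespace(
    speculative_config=SimpleNamespace(method='deepseek_mtp'))

# Same condition as the added check: MTP requested while the ascend
# scheduler is disabled.
try:
    if (not ascend_config.ascend_scheduler_config.enabled
            and vllm_config.speculative_config
            and vllm_config.speculative_config.method == 'deepseek_mtp'):
        raise NotImplementedError(
            "Currently deepseek MTP model is only supported for ascend scheduler."
        )
except NotImplementedError as e:
    print(f"configuration rejected: {e}")
```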

vllm_ascend/core/schedule_config.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -55,6 +55,16 @@ def __post_init__(self) -> None:
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
         self.chunked_prefill_enabled = self.enable_chunked_prefill
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
+            raise ValueError(
+                "Ascend scheduler is enabled without chunked prefill feature. "
+                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
         if self.policy != "fcfs":
             raise NotImplementedError(
                 f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
```
