
Commit 47eaf62

[v0.9.1][bugfix] disable the chunked prefill feature in Non-MLA LLMs (#2659)
### What this PR does / why we need it?

This PR forcibly disables the chunked prefill feature in non-MLA models, as the performance of the operators supporting this functionality is currently suboptimal. At the same time, in engine v1 mode, the Ascend scheduler is forcibly enabled and the `enable_chunked_prefill` option specified by the user in `additional_config` is disabled.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI passed with newly added and existing tests.

Signed-off-by: rjg-lyh <[email protected]>
1 parent: 5926225
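The user-facing behaviour being enforced can be sketched roughly as follows. This is a hypothetical example, not code from this diff: the model name and prompt are placeholders, and the `additional_config` keys mirror the fields exercised in `tests/singlecard/test_ascend_config.py`. On a non-MLA model, an explicit request for chunked prefill is now overridden by `check_and_update_config` in `vllm_ascend/platform.py`, and the Ascend scheduler is enabled instead.

```python
# Hypothetical sketch of the enforced behaviour; not part of this diff.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # example of a non-MLA model
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            # Requested here, but forcibly reset to False for non-MLA
            # models by check_and_update_config in vllm_ascend/platform.py.
            "enable_chunked_prefill": True,
        },
    },
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```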

File tree: 7 files changed (+46, -8 lines)


docs/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -86,8 +86,8 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-locale_dirs = ['locale/']
-gettext_compact = False
+locale_dirs = ['locale/']
+gettext_compact = False
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.

docs/source/tutorials/large_scale_ep.md

Lines changed: 2 additions & 2 deletions

@@ -345,7 +345,7 @@ for process in processes:

 :::::

-Note that the prefiller nodes and the decoder nodes may have differenet configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to differents in 'dp_size_local' and 'dp_rank_start'
+Note that the prefiller nodes and the decoder nodes may have different configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to difference in 'dp_size_local' and 'dp_rank_start'

 ## Example proxy for Distributed DP Server

@@ -395,7 +395,7 @@ python load_balance_proxy_server_example.py \

 You can get the proxy program in the repository's examples, [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py)

-## Benckmark
+## Benchmark

 We recommend use aisbench tool to assess performance. [aisbench](https://gitee.com/aisbench/benchmark) Execute the following commands to install aisbench

tests/conftest.py

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@

 from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
+from vllm_ascend.ascend_config import clear_ascend_config
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -348,6 +349,7 @@ def __enter__(self):

     def __exit__(self, exc_type, exc_value, traceback):
         del self.model
+        clear_ascend_config()
         cleanup_dist_env_and_memory()

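The same clean-up can also be written as a small standalone fixture. This is a hypothetical sketch (the fixture name is made up and it is not part of this diff); it only illustrates the pattern the `__exit__` hook above now follows: `clear_ascend_config()` drops the cached ascend config so one test's settings do not leak into the next.

```python
# Hypothetical pytest fixture; not part of this commit.
import pytest

from vllm_ascend.ascend_config import clear_ascend_config


@pytest.fixture(autouse=True)
def reset_ascend_config():
    # Run the test first, then clear any ascend config it left behind so
    # the next test starts from a clean state.
    yield
    clear_ascend_config()
```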

tests/multicard/test_model_qwen3_w4a8.py

Lines changed: 4 additions & 0 deletions

@@ -25,6 +25,8 @@
 from modelscope import snapshot_download  # type: ignore
 from vllm import LLM, SamplingParams

+from vllm_ascend.ascend_config import clear_ascend_config
+
 MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]
 PROMPTS = [
     "Hello, my name is",
@@ -38,6 +40,7 @@
 @pytest.mark.parametrize("max_tokens", [16])
 def test_qwen3_model_with_w4a8_linear_method(model: str,
                                              max_tokens: int) -> None:
+    clear_ascend_config()
     messages = [[{"role": "user", "content": prompt}] for prompt in PROMPTS]
     sampling_params = SamplingParams(
         max_tokens=max_tokens,
@@ -63,3 +66,4 @@ def test_qwen3_model_with_w4a8_linear_method(model: str,
     for vllm_output, golden_output in zip(vllm_outputs, golden_outputs):
         assert vllm_output.outputs[0].text == golden_output
         print(f"Generated text: {vllm_output.outputs[0].text!r}")
+    clear_ascend_config()

tests/singlecard/core/test_ascend_scheduler.py

Lines changed: 3 additions & 2 deletions

@@ -22,7 +22,7 @@


 def create_scheduler(
-    model: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    model: str = "deepseek-ai/DeepSeek-V2-Lite",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
     enable_prefix_caching: Optional[bool] = None,
@@ -60,6 +60,7 @@ def create_scheduler(
     )
     model_config = ModelConfig(
         model=model,
+        enforce_eager=True,
         task="auto",
         tokenizer=model,
         tokenizer_mode="auto",
@@ -227,7 +228,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

     """
     scheduler = create_scheduler(
-        model="facebook/opt-125m",
+        model="deepseek-ai/DeepSeek-V2-Lite",
         max_num_batched_tokens=1024,
         long_prefill_token_threshold=400,
         enable_prefix_caching=enable_prefix_caching,
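DeepSeek-V2-Lite is an MLA model, so switching the scheduler tests to it keeps the chunked-prefill scheduling paths exercised despite the new non-MLA override; `enforce_eager=True` is presumably there to keep the unit test lightweight (an assumption, not stated in the PR). A hypothetical call to the updated helper, using only arguments that already appear in the hunks above:

```python
# Hypothetical usage of the updated helper; argument values are taken from
# the hunks above rather than from any specific test in this file.
scheduler = create_scheduler(
    model="deepseek-ai/DeepSeek-V2-Lite",
    max_num_batched_tokens=1024,
    long_prefill_token_threshold=400,
    enable_prefix_caching=False,
)
```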

tests/singlecard/test_ascend_config.py

Lines changed: 5 additions & 2 deletions

@@ -41,7 +41,9 @@ def test_run_without_ascend_config():
     assert not ascend_config.torchair_graph_config.use_cached_graph
     assert ascend_config.torchair_graph_config.graph_batch_sizes == []
     assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
-    assert not ascend_config.ascend_scheduler_config.enabled
+    # Non-MLA LLMs forcibly disable the chunked prefill feature
+    # and use AscendScheduler
+    assert ascend_config.ascend_scheduler_config.enabled


 @_clean_up_ascend_config
@@ -81,7 +83,8 @@ def test_run_with_ascend_config():
     assert not ascend_config.torchair_graph_config.enable_multistream_moe
     assert not ascend_config.torchair_graph_config.enable_view_optimize
     assert ascend_config.ascend_scheduler_config.enabled
-    assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
+    # Non-MLA LLMs forcibly disable the chunked prefill feature
+    assert not ascend_config.ascend_scheduler_config.enable_chunked_prefill


 @_clean_up_ascend_config

vllm_ascend/platform.py

Lines changed: 28 additions & 0 deletions

@@ -127,6 +127,34 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        decoding_config = vllm_config.decoding_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if model_config is not None and not model_config.use_mla:
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature,"
+                "as the performance of operators supporting this feature "
+                "functionality is currently suboptimal.")
+            if not envs.VLLM_USE_V1:
+                scheduler_config.enable_chunked_prefill = False
+                scheduler_config.chunked_prefill_enabled = False
+            if envs.VLLM_USE_V1 and \
+                not model_config.is_multimodal_model and \
+                decoding_config.backend == "auto" and \
+                not scheduler_config.delay_factor > 0 and \
+                not scheduler_config.send_delta_data and \
+                scheduler_config.policy == "fcfs" and \
+                scheduler_config.num_scheduler_steps == 1:
+                scheduler_config.enable_chunked_prefill = False
+                scheduler_config.chunked_prefill_enabled = False
+                ascend_scheduler_config.enabled = True
+                if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
+                    ascend_scheduler_config.enable_chunked_prefill = False
+                if (scheduler_config.max_num_batched_tokens <
+                        scheduler_config.max_model_len
+                        and not scheduler_config.chunked_prefill_enabled):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

         if parallel_config:
             if parallel_config.enable_expert_parallel:
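The V1 branch above only fires when the scheduler is still in its plain configuration. Condensed into a single hypothetical predicate (a restatement of the conditions in the hunk; no such function exists in the codebase):

```python
# Hypothetical restatement of the gating above, combining the outer
# non-MLA check with the V1 scheduler conditions in one place.
def can_force_ascend_scheduler(model_config, decoding_config,
                               scheduler_config) -> bool:
    return (not model_config.use_mla  # outer non-MLA check
            and not model_config.is_multimodal_model
            and decoding_config.backend == "auto"
            and not scheduler_config.delay_factor > 0
            and not scheduler_config.send_delta_data
            and scheduler_config.policy == "fcfs"
            and scheduler_config.num_scheduler_steps == 1)
```

When this holds, chunked prefill is switched off, the Ascend scheduler is switched on, and `max_num_batched_tokens` is raised to `max_model_len` if it was smaller, so a full prompt can still be scheduled in a single step.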
