
Commit 47eaf62

[v0.9.1][bugfix] disable the chunked prefill feature in Non-MLA LLMs (#2659)
### What this PR does / why we need it?

This PR forcibly disables the chunked prefill feature in non-MLA models, as the performance of the operators supporting this functionality is currently suboptimal. At the same time, in engine v1 mode, the Ascend scheduler is forcibly enabled and the `enable_chunked_prefill` option specified by the user in `additional_config` is disabled.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI passed with newly added and existing tests.

Signed-off-by: rjg-lyh <[email protected]>
1 parent: 5926225
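The user-facing behaviour being enforced can be sketched roughly as follows. This is a hypothetical example, not code from this diff: the model name and prompt are placeholders, and the `additional_config` keys mirror the fields exercised in `tests/singlecard/test_ascend_config.py`. On a non-MLA model, an explicit request for chunked prefill is now overridden by `check_and_update_config` in `vllm_ascend/platform.py`, and the Ascend scheduler is enabled instead.

```python
# Hypothetical sketch of the enforced behaviour; not part of this diff.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # example of a non-MLA model
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            # Requested here, but forcibly reset to False for non-MLA
            # models by check_and_update_config in vllm_ascend/platform.py.
            "enable_chunked_prefill": True,
        },
    },
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```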

File tree: 7 files changed (+46, -8 lines)


docs/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -86,8 +86,8 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-locale_dirs = ['locale/']
-gettext_compact = False
+locale_dirs = ['locale/']
+gettext_compact = False
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.

docs/source/tutorials/large_scale_ep.md

Lines changed: 2 additions & 2 deletions

@@ -345,7 +345,7 @@ for process in processes:

 :::::

-Note that the prefiller nodes and the decoder nodes may have differenet configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to differents in 'dp_size_local' and 'dp_rank_start'
+Note that the prefiller nodes and the decoder nodes may have different configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to difference in 'dp_size_local' and 'dp_rank_start'

 ## Example proxy for Distributed DP Server

@@ -395,7 +395,7 @@ python load_balance_proxy_server_example.py \

 You can get the proxy program in the repository's examples, [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py)

-## Benckmark
+## Benchmark

 We recommend use aisbench tool to assess performance. [aisbench](https://gitee.com/aisbench/benchmark) Execute the following commands to install aisbench

tests/conftest.py

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@

 from tests.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
+from vllm_ascend.ascend_config import clear_ascend_config
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
@@ -348,6 +349,7 @@ def __enter__(self):

     def __exit__(self, exc_type, exc_value, traceback):
         del self.model
+        clear_ascend_config()
         cleanup_dist_env_and_memory()

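The same clean-up can also be written as a small standalone fixture. This is a hypothetical sketch (the fixture name is made up and it is not part of this diff); it only illustrates the pattern the `__exit__` hook above now follows: `clear_ascend_config()` drops the cached ascend config so one test's settings do not leak into the next.

```python
# Hypothetical pytest fixture; not part of this commit.
import pytest

from vllm_ascend.ascend_config import clear_ascend_config


@pytest.fixture(autouse=True)
def reset_ascend_config():
    # Run the test first, then clear any ascend config it left behind so
    # the next test starts from a clean state.
    yield
    clear_ascend_config()
```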

tests/multicard/test_model_qwen3_w4a8.py

Lines changed: 4 additions & 0 deletions

@@ -25,6 +25,8 @@
 from modelscope import snapshot_download  # type: ignore
 from vllm import LLM, SamplingParams

+from vllm_ascend.ascend_config import clear_ascend_config
+
 MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]
 PROMPTS = [
     "Hello, my name is",
@@ -38,6 +40,7 @@
 @pytest.mark.parametrize("max_tokens", [16])
 def test_qwen3_model_with_w4a8_linear_method(model: str,
                                              max_tokens: int) -> None:
+    clear_ascend_config()
     messages = [[{"role": "user", "content": prompt}] for prompt in PROMPTS]
     sampling_params = SamplingParams(
         max_tokens=max_tokens,
@@ -63,3 +66,4 @@ def test_qwen3_model_with_w4a8_linear_method(model: str,
     for vllm_output, golden_output in zip(vllm_outputs, golden_outputs):
         assert vllm_output.outputs[0].text == golden_output
         print(f"Generated text: {vllm_output.outputs[0].text!r}")
+    clear_ascend_config()

tests/singlecard/core/test_ascend_scheduler.py

Lines changed: 3 additions & 2 deletions

@@ -22,7 +22,7 @@


 def create_scheduler(
-    model: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    model: str = "deepseek-ai/DeepSeek-V2-Lite",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
     enable_prefix_caching: Optional[bool] = None,
@@ -60,6 +60,7 @@ def create_scheduler(
     )
     model_config = ModelConfig(
         model=model,
+        enforce_eager=True,
         task="auto",
         tokenizer=model,
         tokenizer_mode="auto",
@@ -227,7 +228,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

     """
     scheduler = create_scheduler(
-        model="facebook/opt-125m",
+        model="deepseek-ai/DeepSeek-V2-Lite",
         max_num_batched_tokens=1024,
         long_prefill_token_threshold=400,
         enable_prefix_caching=enable_prefix_caching,
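DeepSeek-V2-Lite is an MLA model, so switching the scheduler tests to it keeps the chunked-prefill scheduling paths exercised despite the new non-MLA override; `enforce_eager=True` is presumably there to keep the unit test lightweight (an assumption, not stated in the PR). A hypothetical call to the updated helper, using only arguments that already appear in the hunks above:

```python
# Hypothetical usage of the updated helper; argument values are taken from
# the hunks above rather than from any specific test in this file.
scheduler = create_scheduler(
    model="deepseek-ai/DeepSeek-V2-Lite",
    max_num_batched_tokens=1024,
    long_prefill_token_threshold=400,
    enable_prefix_caching=False,
)
```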

tests/singlecard/test_ascend_config.py

Lines changed: 5 additions & 2 deletions

@@ -41,7 +41,9 @@ def test_run_without_ascend_config():
     assert not ascend_config.torchair_graph_config.use_cached_graph
     assert ascend_config.torchair_graph_config.graph_batch_sizes == []
     assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
-    assert not ascend_config.ascend_scheduler_config.enabled
+    # Non-MLA LLMs forcibly disable the chunked prefill feature
+    # and use AscendScheduler
+    assert ascend_config.ascend_scheduler_config.enabled


 @_clean_up_ascend_config
@@ -81,7 +83,8 @@ def test_run_with_ascend_config():
     assert not ascend_config.torchair_graph_config.enable_multistream_moe
     assert not ascend_config.torchair_graph_config.enable_view_optimize
     assert ascend_config.ascend_scheduler_config.enabled
-    assert ascend_config.ascend_scheduler_config.enable_chunked_prefill
+    # Non-MLA LLMs forcibly disable the chunked prefill feature
+    assert not ascend_config.ascend_scheduler_config.enable_chunked_prefill


 @_clean_up_ascend_config

vllm_ascend/platform.py

Lines changed: 28 additions & 0 deletions

@@ -127,6 +127,34 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        decoding_config = vllm_config.decoding_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if model_config is not None and not model_config.use_mla:
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature,"
+                "as the performance of operators supporting this feature "
+                "functionality is currently suboptimal.")
+            if not envs.VLLM_USE_V1:
+                scheduler_config.enable_chunked_prefill = False
+                scheduler_config.chunked_prefill_enabled = False
+            if envs.VLLM_USE_V1 and \
+                not model_config.is_multimodal_model and \
+                decoding_config.backend == "auto" and \
+                not scheduler_config.delay_factor > 0 and \
+                not scheduler_config.send_delta_data and \
+                scheduler_config.policy == "fcfs" and \
+                scheduler_config.num_scheduler_steps == 1:
+                scheduler_config.enable_chunked_prefill = False
+                scheduler_config.chunked_prefill_enabled = False
+                ascend_scheduler_config.enabled = True
+                if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
+                    ascend_scheduler_config.enable_chunked_prefill = False
+                if (scheduler_config.max_num_batched_tokens <
+                        scheduler_config.max_model_len
+                        and not scheduler_config.chunked_prefill_enabled):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

         if parallel_config:
             if parallel_config.enable_expert_parallel:
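The V1 branch above only fires when the scheduler is still in its plain configuration. Condensed into a single hypothetical predicate (a restatement of the conditions in the hunk; no such function exists in the codebase):

```python
# Hypothetical restatement of the gating above, combining the outer
# non-MLA check with the V1 scheduler conditions in one place.
def can_force_ascend_scheduler(model_config, decoding_config,
                               scheduler_config) -> bool:
    return (not model_config.use_mla  # outer non-MLA check
            and not model_config.is_multimodal_model
            and decoding_config.backend == "auto"
            and not scheduler_config.delay_factor > 0
            and not scheduler_config.send_delta_data
            and scheduler_config.policy == "fcfs"
            and scheduler_config.num_scheduler_steps == 1)
```

When this holds, chunked prefill is switched off, the Ascend scheduler is switched on, and `max_num_batched_tokens` is raised to `max_model_len` if it was smaller, so a full prompt can still be scheduled in a single step.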
