Expand Up @@ -148,9 +148,6 @@ msgid ""
" to be passed in."
msgstr "在为MOE模型使用专家负载均衡时,需要传入专家映射路径。"

#: ../../user_guide/configuration/additional_config.md
msgid "`chunked_prefill_for_mla`"
msgstr "`chunked_prefill_for_mla`"

#: ../../user_guide/configuration/additional_config.md
msgid "`False`"
Expand Down
docs/source/user_guide/configuration/additional_config.md (1 change: 0 additions & 1 deletion)
@@ -30,7 +30,6 @@ The following table lists the additional configuration options available in vLLM
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
- | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
 | `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
 | `enable_shared_expert_dp` | bool | `True` | When the shared expert in DP, it has better performance but consumes more memory. When the memory is sensitive, this switch can be turned off manually. |

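For context, the options in the table above are passed to vLLM through `additional_config`. Below is a minimal offline-inference sketch, assuming the `LLM` constructor accepts `additional_config` as an engine argument; the model path is borrowed from the serving examples later in this diff, while the expert-map path and the nested `enabled` key are illustrative placeholders:

```python
from vllm import LLM

# Illustrative values only: the expert map path is a placeholder and the
# nested "enabled" key for ascend_scheduler_config is an assumption.
llm = LLM(
    model="/models/deepseek_r1_w8a8",  # path reused from the README examples below
    additional_config={
        "ascend_scheduler_config": {"enabled": True},
        "expert_map_path": "/path/to/expert_map.json",
        "enable_shared_expert_dp": False,  # trade the default performance gain for lower memory use
        "kv_cache_dtype": "int8",          # only int8 is currently supported, per the table
    },
)
```

The serving examples in the README diff below pass the same kind of dictionary as a JSON string via `--additional-config`.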
examples/disaggregated_prefill_v1/README.md (4 changes: 0 additions & 4 deletions)
@@ -71,8 +71,6 @@ vllm serve /models/deepseek_r1_w8a8 \
 "engine_id": "0",
 "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
 }' \
- --additional-config \
- '{"chunked_prefill_for_mla":true}'
 ```

 Run prefill server P2 on second node:
@@ -115,8 +113,6 @@ vllm serve /models/deepseek_r1_w8a8 \
 "engine_id": "0",
 "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
 }' \
- --additional-config \
- '{"chunked_prefill_for_mla":true}'
 ```

 Run decode server d1 on third node:
tests/ut/attention/test_mla_v1.py (11 changes: 8 additions & 3 deletions)
@@ -523,8 +523,11 @@ def test_compute_prefill_context_none(self):
 metadata.prefill = None
 prefix_out = torch.randn(2, 16, 128)
 prefix_lse = torch.randn(2, 16, 8)
- out, lse = self.impl._compute_prefill_context(query, kv_cache, 32,
- metadata, prefix_out,
+ q_pe = query[..., self.impl.qk_nope_head_dim:]
+ q_nope = query[..., :self.impl.qk_nope_head_dim]
+
+ out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache,
+ 32, metadata, prefix_out,
 prefix_lse)

 self.assertTrue(torch.equal(prefix_out, out))
@@ -538,6 +541,8 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
 latent_kv_dim = self.impl.kv_lora_rank
 num_blocks, block_size = 100, 20
 query = torch.randn(S, N, D)
+ q_nope = query[..., :self.impl.qk_nope_head_dim]
+ q_pe = query[..., self.impl.qk_nope_head_dim:]
 kv_cache_0 = torch.randn(num_blocks, block_size, N, latent_kv_dim)
 kv_cache_1 = torch.randn(num_blocks, block_size, N, D)
 kv_cache = [kv_cache_0, kv_cache_1]
@@ -559,7 +564,7 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
 meta = MagicMock()
 meta.prefill = prefill_meta

- out, lse = self.impl._compute_prefill_context(query, kv_cache, 32,
+ out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache, 32,
 meta, prefix_out,
 prefix_lse)

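For context, the updated tests split the query tensor into its non-positional (`q_nope`) and rotary (`q_pe`) halves before calling `_compute_prefill_context`. A standalone sketch of that slicing is shown below; the 128/64 head dimensions and tensor sizes are illustrative assumptions, not values taken from the test:

```python
import torch

# Illustrative MLA head dimensions; real values come from the model config.
qk_nope_head_dim = 128  # content ("nope") portion of each query head
qk_rope_head_dim = 64   # rotary ("rope") portion of each query head

S, N = 2, 16  # small example sizes for tokens and heads
query = torch.randn(S, N, qk_nope_head_dim + qk_rope_head_dim)

# Slice along the last dimension, mirroring the indexing added in the test.
q_nope = query[..., :qk_nope_head_dim]  # shape (S, N, 128)
q_pe = query[..., qk_nope_head_dim:]    # shape (S, N, 64)

assert q_nope.shape[-1] == qk_nope_head_dim
assert q_pe.shape[-1] == qk_rope_head_dim
```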
vllm_ascend/ascend_config.py (5 changes: 3 additions & 2 deletions)
@@ -45,11 +45,12 @@ def __init__(self, vllm_config):
 ascend_scheduler_config)

 self.expert_map_path = additional_config.get("expert_map_path", None)
- self.chunked_prefill_for_mla = additional_config.get(
- "chunked_prefill_for_mla", False)
 self.enable_shared_expert_dp = additional_config.get(
 "enable_shared_expert_dp", True
 ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
+ self.enable_mla_prefetch = additional_config.get(
+ "enable_mla_prefetch", True
+ )


 class TorchairGraphConfig: