Expand Up @@ -148,9 +148,6 @@ msgid ""
" to be passed in."
msgstr "在为MOE模型使用专家负载均衡时,需要传入专家映射路径。"

#: ../../user_guide/configuration/additional_config.md
msgid "`chunked_prefill_for_mla`"
msgstr "`chunked_prefill_for_mla`"

#: ../../user_guide/configuration/additional_config.md
msgid "`False`"
Expand Down
docs/source/user_guide/configuration/additional_config.md (1 change: 0 additions & 1 deletion)
@@ -30,7 +30,6 @@ The following table lists the additional configuration options available in vLLM
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
- | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
 | `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
 | `enable_shared_expert_dp` | bool | `True` | When the shared expert in DP, it has better performance but consumes more memory. When the memory is sensitive, this switch can be turned off manually. |

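For context, the options in the table above are passed to vLLM through `additional_config`. Below is a minimal offline-inference sketch, assuming the `LLM` constructor accepts `additional_config` as an engine argument; the model path is borrowed from the serving examples later in this diff, while the expert-map path and the nested `enabled` key are illustrative placeholders:

```python
from vllm import LLM

# Illustrative values only: the expert map path is a placeholder and the
# nested "enabled" key for ascend_scheduler_config is an assumption.
llm = LLM(
    model="/models/deepseek_r1_w8a8",  # path reused from the README examples below
    additional_config={
        "ascend_scheduler_config": {"enabled": True},
        "expert_map_path": "/path/to/expert_map.json",
        "enable_shared_expert_dp": False,  # trade the default performance gain for lower memory use
        "kv_cache_dtype": "int8",          # only int8 is currently supported, per the table
    },
)
```

The serving examples in the README diff below pass the same kind of dictionary as a JSON string via `--additional-config`.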
examples/disaggregated_prefill_v1/README.md (4 changes: 0 additions & 4 deletions)
@@ -71,8 +71,6 @@ vllm serve /models/deepseek_r1_w8a8 \
 "engine_id": "0",
 "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
 }' \
- --additional-config \
- '{"chunked_prefill_for_mla":true}'
 ```

 Run prefill server P2 on second node:
@@ -115,8 +113,6 @@ vllm serve /models/deepseek_r1_w8a8 \
 "engine_id": "0",
 "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
 }' \
- --additional-config \
- '{"chunked_prefill_for_mla":true}'
 ```

 Run decode server d1 on third node:
tests/ut/attention/test_mla_v1.py (11 changes: 8 additions & 3 deletions)
@@ -523,8 +523,11 @@ def test_compute_prefill_context_none(self):
 metadata.prefill = None
 prefix_out = torch.randn(2, 16, 128)
 prefix_lse = torch.randn(2, 16, 8)
- out, lse = self.impl._compute_prefill_context(query, kv_cache, 32,
- metadata, prefix_out,
+ q_pe = query[..., self.impl.qk_nope_head_dim:]
+ q_nope = query[..., :self.impl.qk_nope_head_dim]
+
+ out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache,
+ 32, metadata, prefix_out,
 prefix_lse)

 self.assertTrue(torch.equal(prefix_out, out))
@@ -538,6 +541,8 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
 latent_kv_dim = self.impl.kv_lora_rank
 num_blocks, block_size = 100, 20
 query = torch.randn(S, N, D)
+ q_nope = query[..., :self.impl.qk_nope_head_dim]
+ q_pe = query[..., self.impl.qk_nope_head_dim:]
 kv_cache_0 = torch.randn(num_blocks, block_size, N, latent_kv_dim)
 kv_cache_1 = torch.randn(num_blocks, block_size, N, D)
 kv_cache = [kv_cache_0, kv_cache_1]
@@ -559,7 +564,7 @@ def test_compute_prefill_context(self, mock_ring, mock_load):
 meta = MagicMock()
 meta.prefill = prefill_meta

- out, lse = self.impl._compute_prefill_context(query, kv_cache, 32,
+ out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache, 32,
 meta, prefix_out,
 prefix_lse)

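For context, the updated tests split the query tensor into its non-positional (`q_nope`) and rotary (`q_pe`) halves before calling `_compute_prefill_context`. A standalone sketch of that slicing is shown below; the 128/64 head dimensions and tensor sizes are illustrative assumptions, not values taken from the test:

```python
import torch

# Illustrative MLA head dimensions; real values come from the model config.
qk_nope_head_dim = 128  # content ("nope") portion of each query head
qk_rope_head_dim = 64   # rotary ("rope") portion of each query head

S, N = 2, 16  # small example sizes for tokens and heads
query = torch.randn(S, N, qk_nope_head_dim + qk_rope_head_dim)

# Slice along the last dimension, mirroring the indexing added in the test.
q_nope = query[..., :qk_nope_head_dim]  # shape (S, N, 128)
q_pe = query[..., qk_nope_head_dim:]    # shape (S, N, 64)

assert q_nope.shape[-1] == qk_nope_head_dim
assert q_pe.shape[-1] == qk_rope_head_dim
```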
vllm_ascend/ascend_config.py (5 changes: 3 additions & 2 deletions)
@@ -45,11 +45,12 @@ def __init__(self, vllm_config):
 ascend_scheduler_config)

 self.expert_map_path = additional_config.get("expert_map_path", None)
- self.chunked_prefill_for_mla = additional_config.get(
- "chunked_prefill_for_mla", False)
 self.enable_shared_expert_dp = additional_config.get(
 "enable_shared_expert_dp", True
 ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
+ self.enable_mla_prefetch = additional_config.get(
+ "enable_mla_prefetch", True
+ )


 class TorchairGraphConfig: