
Commit 0f7492d

[Bugfix] Fix OOM during chunked prefill with long contexts such as 64k (#2319)

The attention mask was declared in mla.py; the splitfuse mask is not needed for MLA chunked prefill, and building it causes out-of-memory failures with long contexts such as 64k or 128k.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@14a5d90

Signed-off-by: haojiangzheng <[email protected]>
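To make the memory concern concrete, here is a back-of-the-envelope sizing sketch. It assumes (an assumption for illustration, not something stated in the patch) that the splitfuse mask is materialized as a dense [seq_len, seq_len] tensor in a 2-byte dtype; under that assumption the mask alone reaches roughly 8 GiB at 64k tokens and 32 GiB at 128k.

# Hedged sizing sketch: assumes a dense [seq_len, seq_len] mask in a 2-byte
# dtype (e.g. fp16/bf16). The real vllm_ascend mask layout may differ; this
# only illustrates why the footprint explodes at long context lengths.
def dense_mask_gib(seq_len: int, bytes_per_elem: int = 2) -> float:
    """Return the size in GiB of a dense seq_len x seq_len mask."""
    return seq_len * seq_len * bytes_per_elem / 1024**3


if __name__ == "__main__":
    for ctx in (64 * 1024, 128 * 1024):
        print(f"{ctx} tokens -> ~{dense_mask_gib(ctx):.1f} GiB")
    # 65536 tokens -> ~8.0 GiB
    # 131072 tokens -> ~32.0 GiB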
1 parent 8bfd16a commit 0f7492d

File tree

1 file changed: +1 addition, −1 deletion


vllm_ascend/worker/model_runner_v1.py

Lines changed: 1 addition & 1 deletion
@@ -842,7 +842,7 @@ def get_supported_tasks(self) -> "tuple[SupportedTask, ...]":
     def _make_attention_mask(self, seq_lens, query_lens, position,
                              attn_state) -> torch.Tensor:
         # Chunk Prefill situation.
-        if attn_state == AscendAttentionState.ChunkedPrefill:
+        if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla:
             return self.attn_mask_builder.get_splitfuse_attn_mask(
                 seq_lens, query_lens, position, self.dtype, self.device)
         # Prefill without cache situation.
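For illustration only, the sketch below mirrors the branch condition added in this commit using mocked-up names (the enum and the use_mla flag are simplified stand-ins, not the real vllm_ascend objects): with the fix, chunked prefill on an MLA model no longer selects the splitfuse mask path.

# Minimal, self-contained sketch of the new branch condition. Only the
# control flow matches the patched _make_attention_mask; everything else
# here is a stand-in for the real types.
from enum import Enum, auto


class AscendAttentionState(Enum):
    ChunkedPrefill = auto()
    PrefillNoCache = auto()


def mask_strategy(attn_state: AscendAttentionState, use_mla: bool) -> str:
    # After the fix: the splitfuse mask is built only for non-MLA chunked
    # prefill; MLA models fall through to the other mask-building branches.
    if attn_state == AscendAttentionState.ChunkedPrefill and not use_mla:
        return "splitfuse_attn_mask"
    return "other_branches"


if __name__ == "__main__":
    print(mask_strategy(AscendAttentionState.ChunkedPrefill, use_mla=True))   # other_branches
    print(mask_strategy(AscendAttentionState.ChunkedPrefill, use_mla=False))  # splitfuse_attn_mask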
