
Commit 6d9e5f6

[0.9.1]remove chunked_prefill_for_mla (#2177)
### What this PR does / why we need it?
remove chunked_prefill_for_mla

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Processed prompts: 100%|██████████| 4/4 [00:02<00:00, 1.92it/s, est. speed input: 12.46 toks/s, output: 38.34 toks/s]
DP rank 2, Generated text: ' [Your Name] and I am a professional carpenter with over 10 years of experience in the industry'
DP rank 2, Generated text: ' the head of state and head of government of the United States, indirectly elected to a four-year term'
DP rank 2, Generated text: ' Paris, a city that is renowned for its rich history, culture, and influence on art, fashion'
DP rank 2, Generated text: ' a topic of much speculation and debate. Some experts believe that AI will eventually surpass human intelligence, while'
Processed prompts: 100%|██████████| 4/4 [00:02<00:00, 1.95it/s, est. speed input: 12.65 toks/s, output: 38.93 toks/s]
DP rank 0, Generated text: " Dr. David Hill and today we're going to be talking about how to treat a child with a"
DP rank 0, Generated text: ' the head of state and head of government of the United States, indirectly elected to a four-year term'
DP rank 0, Generated text: ' Paris, a city that is renowned for its rich history, culture, and influence on art, fashion'
DP rank 0, Generated text: ' here, and it’s called ChatGPT. This revolutionary technology is changing the way we interact with machines'
Processed prompts: 100%|██████████| 4/4 [00:02<00:00, 1.97it/s, est. speed input: 12.79 toks/s, output: 39.36 toks/s]
DP rank 1, Generated text: " Dr. David Hill and today we're going to be talking about how to treat a child's fever"
DP rank 3, Generated text: ' [Your Name] and I’m here to talk to you about the importance of a healthy diet'
DP rank 1, Generated text: ' the head of state and head of government of the United States, indirectly elected to a four-year term'
DP rank 1, Generated text: ' Paris, a city that is renowned for its rich history, culture, and influence on art, fashion'
DP rank 1, Generated text: ' a topic of much speculation and debate. Some experts believe that AI will eventually surpass human intelligence, leading'
DP rank 3, Generated text: ' the head of state and head of government of the United States, indirectly elected to a four-year term'
DP rank 3, Generated text: " Paris. It is the largest city in France and serves as the country's political, cultural, and"
DP rank 3, Generated text: ' here, and it’s called ChatGPT. This revolutionary technology is changing the way we interact with machines'

---------

Signed-off-by: fems14 <[email protected]>
1 parent 6a2f792 commit 6d9e5f6
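
The smoke-test output in the commit message drives the same four prompts that appear in `tests/multicard/test_torchair_graph_mode.py` below. A minimal single-process sketch of that kind of check is shown here, assuming vLLM's offline `LLM` API; the model path, sampling settings, and the omission of the data-parallel launcher are assumptions, not details taken from this PR.

```python
# Hedged reproduction sketch: single process, no data parallelism.
# Model path and sampling parameters are placeholders.
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

sampling_params = SamplingParams(temperature=0.0, max_tokens=20)
llm = LLM(model="/models/deepseek_r1_w8a8")  # assumed local DeepSeek weights

for output in llm.generate(prompts, sampling_params):
    print(f"Generated text: {output.outputs[0].text!r}")
```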

File tree

8 files changed: +40, -244 lines


docs/source/user_guide/configuration/additional_config.md

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ The following table lists the additional configuration options available in vLLM
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
-| `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
 | `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
 
 The details of each config option are as follows:
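
For orientation, the remaining options in this table are still supplied through `additional_config`; the sketch below shows one way to pass them with the offline API (assuming the `additional_config` keyword described in these docs). The option values are illustrative placeholders, and `chunked_prefill_for_mla` is no longer read after this change.

```python
# Illustrative only: surviving additional_config options after this PR.
from vllm import LLM

llm = LLM(
    model="/models/deepseek_r1_w8a8",  # placeholder model path
    additional_config={
        "ascend_scheduler_config": {},  # see table above
        "kv_cache_dtype": "int8",       # only int8 is currently supported
        # "chunked_prefill_for_mla": True  <- removed; no longer read by AscendConfig
    },
)
```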

examples/disaggregate_prefill_v1/README.md

Lines changed: 0 additions & 4 deletions
@@ -71,8 +71,6 @@ vllm serve /models/deepseek_r1_w8a8 \
     "engine_id": "0",
     "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
   }' \
-  --additional-config \
-  '{"chunked_prefill_for_mla":true}'
 ```
 
 Run prefill server P2 on second node:
@@ -115,8 +113,6 @@ vllm serve /models/deepseek_r1_w8a8 \
     "engine_id": "0",
     "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
   }' \
-  --additional-config \
-  '{"chunked_prefill_for_mla":true}'
 ```
 
 Run decode server d1 on third node:

tests/multicard/test_torchair_graph_mode.py

Lines changed: 3 additions & 3 deletions
@@ -71,9 +71,9 @@ def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch,
     # inaccurate. This will only change if accuracy improves with the
     # official weights of DeepSeek-V3.
     golden_results = [
-        'Hello, my name is下载早点向前很有่อง',
-        'The president of the United States isSender)## physiological Albany',
-        'The capital of France is Rocky转角 hospitalizedinterval sparked',
+        'Hello, my name is bioavailability裹格外 struct',
+        'The president of the United States isStr Fiona tratamientoPant narciss',
+        'The capital of France is Rocky转角){\\-Hill偷袭',
         'The future of AI is её asegο BIOS一扫',
     ]

vllm_ascend/ascend_config.py

Lines changed: 0 additions & 3 deletions
@@ -54,9 +54,6 @@ def __init__(self, vllm_config):
         self.num_wait_worker_iterations = additional_config.get(
             "num_wait_worker_iterations", 30
         )  # Number of iterations to wait before applying a redistribution plan
-        self.chunked_prefill_for_mla = additional_config.get(
-            "chunked_prefill_for_mla",
-            False)  # Whether to enable the fused operator-like chunked_prefill
         self.enable_weight_nz_layout = additional_config.get(
             "enable_weight_nz_layout", False
         )  # Whether to convert quantized weights to NZ format to accelerate matrix multiplication
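
The surrounding options keep the same `additional_config.get(key, default)` lookup; a minimal standalone illustration of that pattern (the dict and values here are placeholders, not the full `AscendConfig`):

```python
# Placeholder dict standing in for the user-supplied additional_config.
additional_config = {"num_wait_worker_iterations": 10}

num_wait_worker_iterations = additional_config.get("num_wait_worker_iterations", 30)
enable_weight_nz_layout = additional_config.get("enable_weight_nz_layout", False)
# "chunked_prefill_for_mla" is simply no longer looked up after this PR.

print(num_wait_worker_iterations, enable_weight_nz_layout)  # -> 10 False
```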

vllm_ascend/attention/mla_v1.py

Lines changed: 33 additions & 95 deletions
@@ -21,7 +21,6 @@
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
-from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
 from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
 
 if TYPE_CHECKING:
@@ -211,6 +210,9 @@ def __init__(self,
         self.rope_dim = self.runner.model_config.hf_text_config.qk_rope_head_dim
         self.cos_cache = None
         self.sin_cache = None
+        self.prefill_attn_mask = torch.triu(
+            torch.ones(512, 512, device=runner.device, dtype=runner.dtype),
+            1)  # 512: mask only support 512
 
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
@@ -479,7 +481,7 @@ def build(
                 prefill_input_positions].unsqueeze(  # type: ignore
                     1).unsqueeze(2)
             prefill_metadata = AscendMLAPrefillMetadata(
-                attn_mask=self.runner.attn_mask,
+                attn_mask=self.prefill_attn_mask,
                 query_lens=query_lens[tokens_start:],
                 seq_lens=seq_lens,
                 context_lens=seq_lens[tokens_start:],
@@ -767,16 +769,13 @@ def _compute_prefill_context(
             k_nope, v = kv_nope\
                 .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
             k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
-            mask = torch.triu(
-                torch.ones(512, 512, device=query.device, dtype=query.dtype),
-                1)
             torch_npu.atb.npu_ring_mla(
                 q_nope=q_nope,
                 q_rope=q_pe,
                 k_nope=k_nope,
                 k_rope=k_pe,
                 value=v,
-                mask=mask,
+                mask=prefill_metadata.attn_mask,
                 seqlen=seq_len,
                 head_num=self.num_heads,
                 kv_head_num=self.num_heads,
@@ -808,101 +807,40 @@ def _forward_prefill(
                                   self.v_head_dim,
                                   dtype=query.dtype,
                                   device=query.device)
+        attn_lse = torch.empty(self.num_heads,
+                               num_tokens,
+                               dtype=torch.float32,
+                               device=query.device)
         k_nope, value = self.kv_b_proj(kv_c_normed)[0].view(
             -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).split(
                 [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = k_pe.expand((*k_nope.shape[:-1], -1))
-        # Here is only 2 possibility of input, ChunkedPrefill or PrefillNoCache
-        ascend_config = get_ascend_config()
-
-        if attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ] and not ascend_config.chunked_prefill_for_mla:
-            attn_output_torch = torch.empty(num_tokens,
-                                            self.num_heads * self.v_head_dim,
-                                            dtype=query.dtype,
-                                            device=query.device)
-            # current requests is chunked in prefill, disable flash attention with chunked prefill
-            vanilla_chunked_prefill_mla(
-                output=attn_output_torch,
-                query=query,
-                kv_cache=kv_c_and_k_pe_cache,
-                block_tables=attn_metadata.prefill.block_table,
-                query_lens=attn_metadata.prefill.query_lens,
-                context_lens=attn_metadata.prefill.context_lens,
-                kv_b_proj=self.kv_b_proj,
-                max_query_len=attn_metadata.prefill.max_query_len,
-                max_context_len=attn_metadata.prefill.max_seq_lens,
-                nope_dim=self.qk_nope_head_dim,
-                rope_dim=self.qk_rope_head_dim,
-                v_head_dim=self.v_head_dim,
-                scale=self.scale,
-                alibi_slopes=None,
-                causal=True)
-        elif attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ]:
-            attn_lse = torch.empty(self.num_heads,
-                                   num_tokens,
-                                   dtype=torch.float32,
-                                   device=query.device)
-            q_pe = query[..., self.qk_nope_head_dim:]
-            q_nope = query[..., :self.qk_nope_head_dim]
-            mask = torch.triu(
-                torch.ones(512, 512, device=query.device, dtype=query.dtype),
-                1)  # 512: mask only support 512
-            if attn_metadata.num_prefills > 1:
-                mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1,
-                                                1)
-            torch_npu.atb.npu_ring_mla(
-                q_nope=q_nope,
-                q_rope=q_pe,
-                k_nope=k_nope,
-                k_rope=k_pe,
-                value=value,
-                mask=mask,
-                seqlen=torch.tensor(attn_metadata.prefill.query_lens,
-                                    dtype=torch.int32),
-                head_num=self.num_heads,
-                kv_head_num=self.num_heads,
-                pre_out=None,
-                prev_lse=None,
-                qk_scale=self.scale,
-                kernel_type="kernel_type_high_precision",
-                mask_type="mask_type_triu",
-                input_layout="type_bsnd",
-                calc_type="calc_type_first_ring",
-                output=attn_output,
-                softmax_lse=attn_lse)
-            attn_output, attn_lse = self._compute_prefill_context( \
-                query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
-
-        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
-            key = torch.cat((k_nope, k_pe), dim=-1)
-            torch_npu._npu_flash_attention(
-                query=query,
-                key=key,
-                value=value,
-                mask=attn_metadata.attn_mask,
-                seq_len=attn_metadata.prefill.context_lens,
-                scale_value=self.scale,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_heads,
-                out=attn_output)
-            attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim)
+        q_pe = query[..., self.qk_nope_head_dim:]
+        q_nope = query[..., :self.qk_nope_head_dim]
+        torch_npu.atb.npu_ring_mla(q_nope=q_nope,
+                                   q_rope=q_pe,
+                                   k_nope=k_nope,
+                                   k_rope=k_pe,
+                                   value=value,
+                                   mask=attn_metadata.prefill.attn_mask,
+                                   seqlen=torch.tensor(
+                                       attn_metadata.prefill.query_lens,
+                                       dtype=torch.int32),
+                                   head_num=self.num_heads,
+                                   kv_head_num=self.num_heads,
+                                   pre_out=None,
+                                   prev_lse=None,
+                                   qk_scale=self.scale,
+                                   kernel_type="kernel_type_high_precision",
+                                   mask_type="mask_type_triu",
+                                   input_layout="type_bsnd",
+                                   calc_type="calc_type_first_ring",
+                                   output=attn_output,
+                                   softmax_lse=attn_lse)
+        attn_output, attn_lse = self._compute_prefill_context( \
+            query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse)
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
-        if attn_metadata.attn_state in [
-                AscendAttentionState.ChunkedPrefill,
-                AscendAttentionState.SpecDecoding,
-                AscendAttentionState.PrefillCacheHit
-        ] and not ascend_config.chunked_prefill_for_mla:
-            attn_output = attn_output_torch
-
         return attn_output
 
     def exec_kv(
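
The behavioural core of this file's change is that the 512x512 upper-triangular prefill mask is now built once in the metadata builder's `__init__` (as `self.prefill_attn_mask`) and reused by every `npu_ring_mla` call, instead of being recreated inside `_forward_prefill` and `_compute_prefill_context` on each step. A standalone sketch of that mask construction, with a placeholder dtype since the real code uses the runner's device and dtype:

```python
import torch

# Built once and cached, mirroring the new __init__ code; 512 x 512 because,
# per the in-code comment, the mask only supports 512. dtype/device are
# placeholders here.
prefill_attn_mask = torch.triu(
    torch.ones(512, 512, dtype=torch.float16), diagonal=1)

# Entry [i, j] is 1 for j > i (future positions get masked) and 0 elsewhere,
# so every prefill call can reuse the same tensor instead of reallocating it.
assert prefill_attn_mask[0, 1] == 1  # future position is masked
assert prefill_attn_mask[1, 0] == 0  # past position is visible
```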

vllm_ascend/multistream/ms_split.py

Lines changed: 2 additions & 2 deletions
@@ -167,7 +167,7 @@ def model_input_split_v1_mla_attn(
             attn_metadata.prefill.sin,
             token_index - attn_metadata.num_decode_tokens)
         prefill_pre = AscendMLAPrefillMetadata(
-            attn_mask=attn_mask_pre,
+            attn_mask=attn_metadata.prefill.attn_mask,
             query_lens=prefill_query_lens_pre,
             seq_lens=seq_lens_pre,
             query_start_loc=prefill_query_start_loc_pre,
@@ -179,7 +179,7 @@
             cos=cos_pre,
             sin=sin_pre)
         prefill_post = AscendMLAPrefillMetadata(
-            attn_mask=attn_mask_post,
+            attn_mask=attn_metadata.prefill.attn_mask,
             query_lens=prefill_query_lens_post,
             seq_lens=seq_lens_post,
             query_start_loc=prefill_query_start_loc_post,

vllm_ascend/ops/attention.py

Lines changed: 1 addition & 134 deletions
@@ -15,10 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 import torch
-from vllm.model_executor.layers.linear import ColumnParallelLinear
 
 
 # Implementation of vanilla chunked prefill, should be removed after the kernel is ready for
@@ -135,138 +134,6 @@ def vanilla_chunked_prefill(
     return attn_output
 
 
-def vanilla_chunked_prefill_mla(
-        output: torch.Tensor,  # (num_tokens, num_heads, v_head_dim)
-        query: torch.Tensor,  # (num_tokens, num_heads, nope_dim + rope_dim)
-        kv_cache: Tuple[
-            torch.Tensor],  # [nope, rope] (num_blocks, block_size, latent_kv)
-        block_tables: torch.Tensor,  # (batch_size, max_num_blocks_per_seq)
-        query_lens: torch.Tensor,  # (batch_size)
-        context_lens: torch.Tensor,  # (batch_size)
-        kv_b_proj: ColumnParallelLinear,  # ()
-        max_query_len: int,
-        max_context_len: int,
-        nope_dim: int,
-        rope_dim: int,
-        v_head_dim: int,
-        scale: float,
-        alibi_slopes: Optional[torch.Tensor],
-        causal: bool = True) -> None:
-    batch_size = block_tables.size(0)
-    assert len(kv_cache) > 1
-    assert query_lens.size(0) == batch_size
-    num_heads = query.size(1)
-    nope_cache = kv_cache[0]
-    rope_cache = kv_cache[1]
-    block_size = nope_cache.size(1)
-    latent_kv_dim = nope_cache.size(-1)
-    max_num_blocks_per_seq = block_tables.size(1)
-    batch_size = query_lens.size(0)
-    nope_cache = nope_cache.squeeze()
-    # select kv_c out as [batch_size, max_context_len, latent_kv + rope_dim] and get kv_c and k_pe
-    # cached_kv_c: [batch_size, max_context_len, latent_kv]
-    # cached_k_pe: [batch_size, max_context_len, rope_dim]
-    cache_kv_c = nope_cache[block_tables].view(
-        batch_size, max_num_blocks_per_seq * block_size,
-        latent_kv_dim)[:, :max_context_len, :]
-    cache_k_pe = rope_cache[block_tables].view(
-        batch_size, max_num_blocks_per_seq * block_size,
-        rope_dim)[:, :max_context_len, :]
-    # get k_rope and v
-    # k_nope: [batch_size, max_context_len, num_heads, nope_dim]
-    # value: [batch_size, max_context_len, num_heads, v_head_dim]
-    k_nope, value = kv_b_proj(cache_kv_c)[0].view(
-        batch_size, max_context_len, num_heads,
-        nope_dim + v_head_dim).split([nope_dim, v_head_dim], dim=-1)
-    # key: [batch_size, max_context_len, num_hads, rope_dim + nope_dim]
-    key = torch.cat(
-        [k_nope, cache_k_pe.unsqueeze(2).expand(-1, -1, num_heads, -1)],
-        dim=-1)
-
-    context_lens = context_lens.view(-1, 1).to("npu")
-    query_lens = query_lens.view(-1, 1).to("npu")
-    seq_diff = context_lens - query_lens
-
-    q_idx_mask = (torch.arange(0, max_query_len,
-                               device="npu").view(1, -1).repeat(batch_size, 1))
-    kv_c_idx_mask = (torch.arange(0, max_context_len,
-                                  device="npu").view(1,
-                                                     -1).repeat(batch_size, 1))
-    kv_c_mask = kv_c_idx_mask < context_lens
-    q_mask = q_idx_mask < query_lens
-
-    # calculate idx for causal mask of query [batch, max_seqlen_q]
-    causal_mask_idx = (q_idx_mask + seq_diff)[q_mask]
-
-    # generate causal mask [batch, max_seqlen_q, max_seqlen_k]
-    tril_mask = torch.tril(
-        torch.ones(max_context_len, max_context_len, device="npu"))
-    tril_mask[tril_mask == 0] = float("-inf")
-    tril_mask[tril_mask == 1] = 0
-    causal_mask = tril_mask[causal_mask_idx]
-    causal_mask_padding = torch.empty(
-        [batch_size, max_query_len, max_context_len],
-        device="npu").fill_(float("-inf"))
-    causal_mask_padding[q_mask] = causal_mask
-    # to [batch, num_heads, max_seqlen_q, max_seqlen_k]
-    causal_mask_padding = causal_mask_padding.unsqueeze(1)
-
-    pad_q = torch.zeros(
-        [batch_size, max_query_len, num_heads, rope_dim + nope_dim],
-        device="npu",
-        dtype=query.dtype,
-    )
-    pad_k = torch.zeros(
-        [batch_size, max_context_len, num_heads, rope_dim + nope_dim],
-        device="npu",
-        dtype=key.dtype,
-    )
-    pad_v = torch.zeros(
-        [batch_size, max_context_len, num_heads, v_head_dim],
-        device="npu",
-        dtype=value.dtype,
-    )
-    num_query = torch.sum(q_mask).item()
-    num_add_query = num_query - query.size(0)
-    # mtp will come in
-    if num_add_query > 0:
-        add_query_size = query.size()
-        add_query_size = list(add_query_size)
-        add_query_size[0] = num_add_query
-        pad_tensor = torch.zeros(add_query_size,
-                                 dtype=query.dtype,
-                                 device=query.device)
-        query = torch.cat([query, pad_tensor], dim=0)
-    pad_q[q_mask] = query
-    pad_k[kv_c_mask] = key[kv_c_mask]
-    pad_v[kv_c_mask] = value[kv_c_mask]
-
-    pad_q = pad_q.permute(0, 2, 1, 3)
-    pad_k = pad_k.permute(0, 2, 1, 3)
-    pad_v = pad_v.permute(0, 2, 1, 3)
-    attn_mask = torch.empty([batch_size, 1, 1, max_context_len],
-                            device="npu").fill_(float("-inf"))
-    attn_mask[:, :, :, :max_context_len].masked_fill_(
-        kv_c_mask[:, None, None, :], 0)
-    # [b, h, f, t]
-    attn_weights = torch.einsum("bhqd,bhkd->bhqk", pad_q, pad_k)
-    attn_weights *= scale
-    attn_mask = attn_mask.float()
-    attn_weights = attn_weights + attn_mask
-    if causal:
-        attn_weights = attn_weights + causal_mask_padding
-
-    attn_weights = torch.softmax(attn_weights, dim=-1)
-    attn_output = torch.einsum("bhqk,bhkd->bhqd", attn_weights, pad_v.float())
-    attn_output = attn_output.permute(0, 2, 1, 3)
-
-    attn_output = (attn_output[q_mask].view([-1, num_heads,
-                                             v_head_dim]).to(output.dtype))
-    attn_output = attn_output.view_as(output)
-    output.copy_(attn_output)
-    return attn_output
-
-
 def vanilla_decode_mla(
     query: torch.Tensor,  # [num_tokens, num_heads, latent_dim + rope_dim]
     key_cache: torch.
