
Commit 9dc23b6

[V0.9.1][BugFix] Fix bugs and refactor cached mask generation logic (#2326)
### What this PR does / why we need it?
This PR fixes bugs and refactors the cached mask generation logic. The cached mask is now pre-constructed and used on the CPU instead of on the NPU device.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new and existing tests.

Signed-off-by: rjg-lyh <[email protected]>
1 parent 6d9e5f6 commit 9dc23b6
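
The core idea, as described above, is that the full-size attention mask is built once on the CPU and only the slice needed for the current step is moved to the NPU. Below is a minimal, self-contained sketch of that flow; build_causal_mask is a hypothetical stand-in for vllm_ascend's generate_attn_mask helper (not shown in this diff), and plain "cpu" stands in for the NPU device.

import torch

# Hypothetical stand-in for generate_attn_mask (not part of this diff):
# a 0/1 upper-triangular mask where 1 marks positions to be masked out.
def build_causal_mask(max_seq_len: int, dtype: torch.dtype) -> torch.Tensor:
    return torch.triu(torch.ones(max_seq_len, max_seq_len, dtype=dtype),
                      diagonal=1)

# Pre-construct the full-size mask once, on the CPU, at init time.
cached_mask = build_causal_mask(max_seq_len=4096, dtype=torch.float16)

# Per step, slice the cached CPU mask and transfer only the slice to the
# device ("cpu" here stands in for the NPU).
device = torch.device("cpu")
max_seq_len = 128
step_mask = cached_mask[:max_seq_len, :max_seq_len].contiguous().to(device)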

File tree

2 files changed: +38 additions, −68 deletions

vllm_ascend/attention/attention.py

Lines changed: 26 additions & 45 deletions
@@ -65,7 +65,6 @@ class AttentionMaskBuilder:
     def __init__(self, attn_mask: torch.Tensor):
         self._seq_len_cached = attn_mask.shape[0]
         self.attn_mask_cache = attn_mask
-        self.splitfuse_mask_value = -10000
 
     @classmethod
     def initialize_from_len(cls,
@@ -74,18 +73,25 @@ def initialize_from_len(cls,
                             mask_value: Optional[int] = None):
         return cls(generate_attn_mask(max_seq_len, dtype, mask_value))
 
-    def update_attn_cache(self, seqlen: int, dtype: torch.dtype,
-                          device: torch.device):
-        if seqlen > self._seq_len_cached or self.attn_mask_cache.dtype != dtype:
+    @staticmethod
+    def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
+        mask_scale_factor = 1
+        if dtype == torch.bfloat16:
+            mask_scale_factor = -10000
+        return mask_scale_factor
+
+    def update_attn_cache(self, seqlen: int, dtype: torch.dtype):
+        if seqlen > self._seq_len_cached:
             self._seq_len_cached = seqlen
             self.attn_mask_cache = generate_attn_mask(seqlen, dtype)
-        if self.attn_mask_cache.device != device:
-            self.attn_mask_cache = self.attn_mask_cache.to(device)
+        if self.attn_mask_cache.dtype != dtype:
+            self.attn_mask_cache = self.attn_mask_cache.to(dtype)
 
     def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
                       device: torch.device):
-        self.update_attn_cache(max_seq_len, dtype, device)
-        return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous()
+        self.update_attn_cache(max_seq_len, dtype)
+        return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
+        ).to(device)
 
     def get_decode_attn_mask(
         self,
@@ -94,53 +100,28 @@ def get_decode_attn_mask(
         dtype: torch.dtype,
         device: torch.device,
     ):
-        self.update_attn_cache(max_s, dtype, device)
+        self.update_attn_cache(max_s, dtype)
         return (self.attn_mask_cache.index_select(
-            0, input_lengths)[:, :max_s].view(-1, 1, max_s).contiguous())
+            0, input_lengths)[:, :max_s].view(-1, 1,
+                                              max_s).contiguous().to(device))
 
     def get_splitfuse_attn_mask(
         self,
         seq_lens,
-        query_lens,
         position,
         dtype,
         device,
     ) -> torch.Tensor:
         max_seq_len = max(seq_lens, default=0)
-        if max_seq_len <= self._seq_len_cached:
-            self.update_attn_cache(max_seq_len, dtype, device)
-            # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
-            # is not the same. Fix this in the future when kernel is ready.
-            if self.attn_mask_cache.numel(
-            ) > 1 and self.attn_mask_cache[0][1] > 0:
-                attn_mask = self.get_attn_mask(  # type: ignore
-                    max_seq_len, dtype, device)
-                attn_mask *= -10000
-            else:
-                attn_mask = self.attn_mask_cache
-            return torch.index_select(attn_mask, dim=0,
-                                      index=position)[:, :max_seq_len]
-        total_q_len = sum(query_lens)
-        attn_mask = torch.zeros((total_q_len, max_seq_len),
-                                dtype=dtype,
-                                device="cpu")
-
-        current_row = 0
-        for i in range(len(query_lens)):
-            seq_len = seq_lens[i]
-            q_len = query_lens[i]
-            context_len = seq_len - q_len
-
-            assert context_len >= 0
-            attn_mask[current_row:current_row + q_len,
-                      context_len:] = self.splitfuse_mask_value
-            right_tensor = attn_mask[current_row:current_row + q_len,
-                                     context_len:seq_len]
-            right_tensor.masked_fill_(
-                right_tensor.tril() == self.splitfuse_mask_value, 0)
-            current_row += q_len
-
-        return attn_mask.to(device, non_blocking=True)
+        self.update_attn_cache(max_seq_len, dtype)
+        # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
+        # is not the same. Fix this in the future when kernel is ready.
+        mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype)
+        attn_mask = torch.index_select(self.attn_mask_cache,
                                        dim=0,
                                        index=position)[:, :max_seq_len]
+        attn_mask *= mask_scale_factor
+        return attn_mask.contiguous().to(device, non_blocking=True)
 
 
 class AscendAttentionBackend(AttentionBackend):
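
To illustrate the refactored chunked-prefill path above: the mask is now always taken from the CPU cache, selected by token position, scaled per dtype, and only then moved to the device. A rough, self-contained sketch follows; the toy 0/1 mask and sizes are assumptions, since the real cache comes from generate_attn_mask, which this diff does not touch.

import torch

# Toy cached mask on the CPU: 1 marks masked positions (an assumption about
# what generate_attn_mask produces).
max_model_len = 16
cached_mask = torch.triu(
    torch.ones(max_model_len, max_model_len, dtype=torch.bfloat16), diagonal=1)

def get_mask_scale_factor(dtype: torch.dtype) -> int:
    # Mirrors the new static method: -10000 for bfloat16, otherwise 1.
    return -10000 if dtype == torch.bfloat16 else 1

position = torch.tensor([0, 1, 2, 5, 6])  # toy per-token positions
max_seq_len = 7
device = torch.device("cpu")  # stand-in for the NPU device

# Select the rows for the current tokens, scale, then move to the device.
attn_mask = torch.index_select(cached_mask, dim=0,
                               index=position)[:, :max_seq_len]
attn_mask = attn_mask * get_mask_scale_factor(torch.bfloat16)
attn_mask = attn_mask.contiguous().to(device, non_blocking=True)
print(attn_mask.shape)  # torch.Size([5, 7])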

vllm_ascend/worker/model_runner_v1.py

Lines changed: 12 additions & 23 deletions
@@ -20,7 +20,6 @@
 import copy
 import gc
 import math
-import os
 import time
 import types
 import weakref
@@ -349,19 +348,10 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             reversed(
                 self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
-        # NOTE: Pre-construct a mask matrix to improve the efficiency of
+        # NOTE: Pre-construct a mask matrix on cpu to improve the efficiency of
         # attention mask construction during inference.
-        # Note that the length of the matrix needs to be carefully balanced: a
-        # matrix that is too large will consume excessive VRAM, while a matrix
-        # that is too small will require dynamic concatenation during inference,
-        # leading to performance degradation.
-        # Therefore, an environment variable is added here to dynamically set
-        # the size of the pre-constructed mask matrix based on requirements.
-        mask_len = os.getenv("PAGED_ATTENTION_MASK_LEN", 10000)
-        self.attn_mask_len = min(self.model_config.max_model_len,
-                                 int(mask_len))
         self.attn_mask_builder = AttentionMaskBuilder.initialize_from_len(
-            self.attn_mask_len, self.dtype)
+            self.model_config.max_model_len, self.dtype)
 
         self.sampler = Sampler()
         self.new_kv_cache_bytes = -1
@@ -703,12 +693,12 @@ def _check_dbo_is_valid(self, query_lens: torch.Tensor,
     def get_model(self) -> nn.Module:
         return self.model
 
-    def _make_attention_mask(self, seq_lens, query_lens, position,
+    def _make_attention_mask(self, seq_lens, position,
                              attn_state) -> torch.Tensor:
         # Chunk Prefill situation.
         if attn_state == AscendAttentionState.ChunkedPrefill:
             return self.attn_mask_builder.get_splitfuse_attn_mask(
-                seq_lens, query_lens, position, self.dtype, self.device)
+                seq_lens, position, self.dtype, self.device)
         # Prefill without cache situation.
         elif attn_state == AscendAttentionState.PrefillNoCache:
             max_seq_len = max(seq_lens, default=0)
@@ -956,16 +946,17 @@ def _process_reqs(
             self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
             non_blocking=True)
 
-        self.positions[total_num_scheduled_tokens:num_input_tokens].zero_()
-        self.positions[:total_num_scheduled_tokens].copy_(
-            self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
+        self.positions_cpu[total_num_scheduled_tokens:num_input_tokens].zero_()
+        self.positions[:num_input_tokens].copy_(
+            self.positions_cpu[:num_input_tokens], non_blocking=True)
+        positions_cpu = self.positions_cpu[:num_input_tokens]
         positions = self.positions[:num_input_tokens]
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
 
         self.seq_lens_np[:num_reqs] = (
            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
            num_scheduled_tokens)
-        seq_lens = self.seq_lens_cpu[:num_reqs]
+        seq_lens_cpu = self.seq_lens_cpu[:num_reqs]
 
         block_table_indices = (req_indices * self.max_num_blocks_per_req +
                                positions_np // self.block_size)
@@ -999,11 +990,9 @@ def _process_reqs(
 
         # NOTE: when use ring_mla, attn_mask don't need to generate here.
         if not self.vllm_config.model_config.use_mla:
-            attn_mask = self._make_attention_mask(
-                seq_lens=seq_lens,
-                query_lens=num_scheduled_tokens,
-                position=positions,
-                attn_state=attn_state)
+            attn_mask = self._make_attention_mask(seq_lens=seq_lens_cpu,
+                                                  position=positions_cpu,
+                                                  attn_state=attn_state)
             self.attn_mask = attn_mask
             self.attn_state = attn_state  # type: ignore
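The runner-side changes above also shift the positions bookkeeping to the CPU buffer: the padding tail is zeroed on the CPU copy, the padded slice is transferred to the device in one non-blocking copy, and CPU views (positions_cpu, seq_lens_cpu) are kept so the attention mask can be built from CPU tensors. A rough sketch with illustrative names follows; positions_cpu_buf and positions_dev_buf are stand-ins for the runner's self.positions_cpu / self.positions, and "cpu" again stands in for the NPU device.

import torch

max_num_tokens = 8
total_num_scheduled_tokens = 5   # real tokens this step
num_input_tokens = 6             # padded length actually fed to the model

positions_cpu_buf = torch.arange(max_num_tokens, dtype=torch.int64)
positions_dev_buf = torch.empty(max_num_tokens, dtype=torch.int64, device="cpu")

# Zero the padding region on the CPU buffer, then copy the whole padded slice
# to the device in one non-blocking transfer.
positions_cpu_buf[total_num_scheduled_tokens:num_input_tokens].zero_()
positions_dev_buf[:num_input_tokens].copy_(
    positions_cpu_buf[:num_input_tokens], non_blocking=True)

# Keep a CPU view around so the attention mask can be built from CPU tensors.
positions_cpu = positions_cpu_buf[:num_input_tokens]
positions = positions_dev_buf[:num_input_tokens]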
