Commit c416bbf

refact attn metadata build
Signed-off-by: weiguihua2 <[email protected]>
1 parent c4c07a0 commit c416bbf

File tree: 5 files changed (+33, −25 lines)

vllm_ascend/attention/attention_v1.py

Lines changed: 4 additions & 2 deletions
@@ -27,7 +27,7 @@
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.forward_context import ForwardContext, get_forward_context
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, cdiv
 from vllm.v1.core.sched.output import SchedulerOutput

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
@@ -165,6 +165,8 @@ def __init__(self,
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.device = device
+        self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
+                                           vllm_config.cache_config.block_size)

     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
@@ -178,7 +180,7 @@ def build(self,
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:num_reqs + 1]

         block_table = common_attn_metadata.block_table_tensor
-        block_table[:num_reqs, :common_attn_metadata.max_num_blocks_per_req] = (
+        block_table[:num_reqs, :self.max_num_blocks_per_req] = (
             block_table[:num_reqs])

         query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]

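For context, here is a minimal self-contained sketch of the ceiling-division arithmetic the builder now performs itself instead of reading max_num_blocks_per_req from the common attention metadata. cdiv is re-implemented inline so the snippet runs without vllm installed, and the max_model_len / block_size values are hypothetical:

def cdiv(a: int, b: int) -> int:
    # Ceiling division without floating point, matching the intent of vllm.utils.cdiv.
    return -(a // -b)

max_model_len = 4096   # hypothetical model_config.max_model_len
block_size = 128       # hypothetical cache_config.block_size

max_num_blocks_per_req = cdiv(max_model_len, block_size)
print(max_num_blocks_per_req)  # 32: at most 32 KV-cache blocks per request
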
vllm_ascend/attention/attention_v1_torchair.py

Lines changed: 7 additions & 3 deletions
@@ -23,6 +23,7 @@
 import torch_npu
 import torch.nn as nn
 from vllm.config import VllmConfig
+from vllm.utils import cdiv
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
@@ -32,7 +33,7 @@
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
 from vllm_ascend.worker.npu_input_batch import InputBatch
-from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, get_decode_token_per_req


 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -154,6 +155,9 @@ def __init__(self,
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.device = device
+        self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
+                                           vllm_config.cache_config.block_size)
+        self.decode_token_per_req = get_decode_token_per_req(vllm_config.speculative_config)

     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
@@ -214,7 +218,7 @@ def build(self,
         num_actual_tokens = common_attn_metadata.num_actual_tokens

         block_table = common_attn_metadata.block_table_tensor
-        block_table[:num_reqs, :common_attn_metadata.max_num_blocks_per_req] = (
+        block_table[:num_reqs, :self.max_num_blocks_per_req] = (
             block_table[:num_reqs])

         seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
@@ -253,7 +257,7 @@ def build(self,
             pad_value = 0
             num_token_pad_size = graph_pad_size - num_actual_tokens
             num_reqs_pad_size = (
-                graph_pad_size // common_attn_metadata.decode_token_per_req -
+                graph_pad_size // self.decode_token_per_req -
                 num_reqs)
             pad_value = 1
             padded_seq_lens = seq_lens.tolist() + [pad_value

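The torchair builder also takes over decode_token_per_req, which drives the graph-padding arithmetic in build(). A rough, standalone sketch of that arithmetic under made-up numbers (not the real builder code):

decode_token_per_req = 2    # e.g. 1 target token + 1 speculative token per request
num_reqs = 3                # decode requests in the batch
num_actual_tokens = num_reqs * decode_token_per_req    # 6 tokens scheduled
graph_pad_size = 16         # hypothetical captured-graph token budget

num_token_pad_size = graph_pad_size - num_actual_tokens                 # 10 padding tokens
num_reqs_pad_size = graph_pad_size // decode_token_per_req - num_reqs   # 5 padding requests

# The padded batch matches the captured graph shape: graph_pad_size tokens
# spread over num_reqs + num_reqs_pad_size requests, decode_token_per_req each.
print(num_token_pad_size, num_reqs_pad_size)  # 10 5
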
vllm_ascend/attention/mla_v1.py

Lines changed: 3 additions & 2 deletions
@@ -25,7 +25,7 @@
 from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch
-from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,split_decodes_and_prefills)
+from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,split_decodes_and_prefills, get_decode_token_per_req)


 if TYPE_CHECKING:
@@ -186,6 +186,7 @@ def __init__(self,
         scheduler_config = vllm_config.scheduler_config
         self.block_size = vllm_config.cache_config.block_size
         self.max_blocks = (vllm_config.model_config.max_model_len + self.block_size - 1) // self.block_size
+        self.decode_token_per_req = get_decode_token_per_req(vllm_config.speculative_config)
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
             self.chunked_prefill_workspace_size = min(
@@ -293,7 +294,7 @@ def build_torchair_graph_dummy(
             device=device)
         block_table = self._get_graph_runner_block_tables(
             num_reqs, block_table)
-        num_tokens = num_reqs * common_attn_metadata.decode_token_per_req
+        num_tokens = num_reqs * self.decode_token_per_req
         seq_lens = torch.zeros(num_reqs, dtype=torch.int32, device=device)
         seq_lens_list = [0] * num_reqs
         input_positions = torch.zeros(num_tokens,

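mla_v1.py keeps its manual spelling of ceiling division for max_blocks while attention_v1.py uses cdiv; the two forms agree, as this small sketch with hypothetical values shows:

def cdiv(a: int, b: int) -> int:
    return -(a // -b)

max_model_len = 32768   # hypothetical value
block_size = 128        # hypothetical value

max_blocks = (max_model_len + block_size - 1) // block_size
assert max_blocks == cdiv(max_model_len, block_size) == 256

The dummy torchair graph build then sizes its tensors as num_tokens = num_reqs * decode_token_per_req, now read from the builder rather than from the metadata object.
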
vllm_ascend/attention/utils.py

Lines changed: 16 additions & 11 deletions
@@ -1,5 +1,6 @@
 from dataclasses import dataclass

+from vllm.config import SpeculativeConfig
 from vllm_ascend.attention.attention_v1 import AscendAttentionState

 import torch
@@ -14,26 +15,23 @@ class AscendCommonAttentionMetadata:
     For many of the tensors we keep both GPU and CPU versions.
     """

-    query_start_loc: torch.Tensor = None
-    query_start_loc_cpu: torch.Tensor = None
+    query_start_loc: torch.Tensor
+    query_start_loc_cpu: torch.Tensor
     """(batch_size + 1,), the start location of each request in query Tensor"""

-    seq_lens: torch.Tensor = None
-    seq_lens_cpu: torch.Tensor = None
+    seq_lens_cpu: torch.Tensor
     """(batch_size,), the length of each request including both computed tokens
     and newly scheduled tokens"""

     num_reqs: int
     """Number of requests"""
     num_actual_tokens: int
     """Total number of tokens in batch"""
-    max_query_len: int
-    """Longest query in batch"""

     actual_seq_lengths_q: list[int] = None

-    block_table_tensor: torch.Tensor = None
-    slot_mapping_cpu: torch.Tensor = None
+    block_table_tensor: torch.Tensor
+    slot_mapping_cpu: torch.Tensor

     positions: torch.Tensor = None

@@ -47,7 +45,7 @@ class AscendCommonAttentionMetadata:

     enable_dbo_across_dp: bool = False

-    is_only_prefill: bool
+    is_only_prefill: bool = False

     graph_pad_size: int = -1

@@ -70,8 +68,6 @@ class TorchairCommonAttentionMetadata:
     attn_mask: torch.Tensor = None
     spec_attn_mask: torch.Tensor = None

-    decode_token_per_req: int
-
     graph_pad_size: int = -1


@@ -115,3 +111,12 @@ def split_decodes_and_prefills(
     num_decode_tokens = query_start_loc[first_prefill].item()
     num_prefill_tokens = num_tokens - num_decode_tokens
     return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens)
+
+
+def get_decode_token_per_req(speculative_config: SpeculativeConfig):
+    decode_token_per_req = 1
+    if not speculative_config:
+        return decode_token_per_req
+    spec_token_num = speculative_config.num_speculative_tokens
+    assert spec_token_num > 0
+    return decode_token_per_req + spec_token_num

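To illustrate the behaviour of the new helper in isolation, here is a runnable re-implementation against a stand-in config object (the _StubSpeculativeConfig class is hypothetical; the real argument is vllm's SpeculativeConfig):

from dataclasses import dataclass
from typing import Optional


@dataclass
class _StubSpeculativeConfig:
    num_speculative_tokens: int


def get_decode_token_per_req(speculative_config: Optional[_StubSpeculativeConfig]) -> int:
    # Without speculative decoding, each request decodes one token per step.
    if not speculative_config:
        return 1
    spec_token_num = speculative_config.num_speculative_tokens
    assert spec_token_num > 0
    # One target token plus the speculative draft tokens.
    return 1 + spec_token_num


print(get_decode_token_per_req(None))                        # 1
print(get_decode_token_per_req(_StubSpeculativeConfig(3)))   # 4
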
vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 7 deletions
@@ -92,7 +92,7 @@
 from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
-from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, get_decode_token_per_req

 if not vllm_version_is("0.10.0"):
     from vllm.tasks import GenerationTask, SupportedTask
@@ -221,7 +221,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             use_mla=self.model_config.use_mla,
         )
         self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
-            weakref.proxy(self))
+            vllm_config, device)
         self.attn_mask_builder = AttentionMaskBuilder(
             min(self.model_config.max_model_len,
                 int(os.getenv("PAGED_ATTENTION_MASK_LEN", 10000))), self.dtype)
@@ -234,13 +234,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.drafter: Optional[Union[NgramProposer, EagleProposer,
                                      MtpProposer]] = None
         self.actual_seq_lengths_q = []
-        self.spec_token_num = 0
-        self.decode_token_per_req = 1
+        self.decode_token_per_req = get_decode_token_per_req(self.speculative_config)
         if self.speculative_config:
             self.use_spec_decode = True
-            self.spec_token_num = self.speculative_config.num_speculative_tokens
-            assert self.spec_token_num > 0
-            self.decode_token_per_req = 1 + self.spec_token_num
             self.actual_seq_lengths_q = [
                 len for len in
                 range(self.decode_token_per_req, self.max_num_tokens +

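The runner-side change boils down to a new constructor contract: metadata builders are instantiated from (vllm_config, device) instead of a weakref.proxy of the whole runner. A toy sketch of that contract (_Toy* names are hypothetical stand-ins, not vllm classes):

class _ToyAttentionBackend:

    class _Builder:
        def __init__(self, vllm_config, device):
            # The builder captures what it needs from the config up front,
            # so it no longer reaches back into the model runner at build time.
            self.vllm_config = vllm_config
            self.device = device

    @classmethod
    def get_builder_cls(cls):
        return cls._Builder


vllm_config = object()   # stand-in for the real VllmConfig
builder = _ToyAttentionBackend.get_builder_cls()(vllm_config, "npu:0")
print(type(builder).__name__)   # _Builder
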