@@ -119,7 +119,6 @@ class AscendAttentionState(Enum):
 
 @dataclass
 class AscendMetadata:
-
     # **************************** Basic Properties ************************** #
     attn_mask: Optional[torch.Tensor] = None
     # Current state of this attention run.
@@ -155,37 +154,106 @@ class AscendMetadata:
     is_only_prefill: bool = False
 
 
+@dataclass
+class AscendAttentionMetadataBuildInfo:
+    block_tables: torch.Tensor = None
+    query_start_loc: torch.Tensor = None
+    query_lens: torch.Tensor = None
+    seq_lens: torch.Tensor = None
+    slot_mapping: torch.Tensor = None
+    attn_mask: torch.Tensor = None
+    attn_state: AscendAttentionState = None
+
+
 class AscendAttentionMetadataBuilder:
 
     def __init__(self, runner):
         self.runner = runner
 
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
+    def reorder_batch(
+        self,
+        input_batch: "InputBatch",
+        scheduler_output: "SchedulerOutput",
+    ) -> bool:
         return False
 
-    def build(self,
-              num_reqs,
-              num_actual_tokens,
-              max_query_len,
-              enable_dbo_across_dp: bool = False,
-              is_only_prefill: bool = False):
+    def _assemble_build_info(
+        self,
+        num_reqs,
+        num_actual_tokens,
+        max_query_len,
+        block_tables,
+        query_start_loc,
+        query_lens,
+        seq_lens,
+        slot_mapping,
+        attn_mask,
+        attn_state: "AscendAttentionState",
+        *args,
+        **kwargs,
+    ) -> "AscendAttentionMetadataBuildInfo":
+        build_info = AscendAttentionMetadataBuildInfo(
+            block_tables=block_tables,
+            query_start_loc=query_start_loc,
+            query_lens=query_lens,
+            seq_lens=seq_lens,
+            slot_mapping=slot_mapping,
+            attn_mask=attn_mask,
+            attn_state=attn_state)
+        return build_info
+
+    def _prepare_build_info(
+        self,
+        num_reqs,
+        num_actual_tokens,
+        max_query_len,
+        enable_dbo_across_dp,
+        is_only_prefill,
+        *args,
+        **kwargs,
+    ) -> "AscendAttentionMetadataBuildInfo":
+        device = self.runner.device
+
+        block_tables = self.runner.input_batch.block_table[
+            0].get_device_tensor()
+        block_tables[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_tables[:num_reqs])
 
-        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
-        )
-        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-            block_table[:num_reqs])
+        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
+        query_start_loc = query_start_loc_cpu.to(device, non_blocking=True)
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
-            self.runner.device, non_blocking=True)
+            device, non_blocking=True)
         attn_mask = self.runner.attn_mask
         attn_state = self.runner.attn_state
-        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
-                                                 non_blocking=True)
 
+        build_info = self._assemble_build_info(num_reqs, num_actual_tokens,
+                                               max_query_len, block_tables,
+                                               query_start_loc, query_lens,
+                                               seq_lens, slot_mapping,
+                                               attn_mask, attn_state, args,
+                                               kwargs)
+        return build_info
+
+    def build(
+        self,
+        num_reqs,
+        num_actual_tokens,
+        max_query_len,
+        enable_dbo_across_dp: bool = False,
+        is_only_prefill: bool = False,
+        *args,
+        **kwargs,
+    ):
+        build_info = self._prepare_build_info(num_reqs, num_actual_tokens,
+                                              max_query_len,
+                                              enable_dbo_across_dp,
+                                              is_only_prefill, args, kwargs)
+
+        attn_mask = build_info.attn_mask
+        attn_state = build_info.attn_state
         if is_310p():
             if attn_state == AscendAttentionState.PrefillNoCache:
                 mask_nz = nd_to_nz_2d(attn_mask)
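
Taken together, the hunk above routes every intermediate tensor through a single AscendAttentionMetadataBuildInfo bag: _prepare_build_info gathers the tensors from the runner, _assemble_build_info packs them, and build only unpacks the result. Below is a minimal, self-contained sketch of that pattern; the FakeRunner, its toy tensors, and the trimmed-down field list are illustrative stand-ins rather than the real vllm-ascend objects.

# Minimal sketch of the prepare/assemble split shown above (not vllm-ascend
# code): a dataclass carries the intermediate tensors, and a subclass could
# override _assemble_build_info to pack extra fields without re-reading the
# runner state.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class BuildInfo:
    block_tables: Optional[torch.Tensor] = None
    seq_lens: Optional[torch.Tensor] = None
    slot_mapping: Optional[torch.Tensor] = None


class Builder:

    def __init__(self, runner):
        self.runner = runner

    def _prepare_build_info(self, num_reqs, num_actual_tokens) -> BuildInfo:
        # Gather the per-step tensors from the runner (device copies elided).
        return self._assemble_build_info(
            block_tables=self.runner.block_table[:num_reqs],
            seq_lens=self.runner.seq_lens_cpu[:num_reqs],
            slot_mapping=self.runner.slot_mapping_cpu[:num_actual_tokens])

    def _assemble_build_info(self, **fields) -> BuildInfo:
        # Hook point: pack the gathered tensors into the info object.
        return BuildInfo(**fields)

    def build(self, num_reqs, num_actual_tokens):
        # The real build() would turn this into AscendMetadata.
        return self._prepare_build_info(num_reqs, num_actual_tokens)


class FakeRunner:
    # Toy stand-ins for the model runner's persistent buffers.
    block_table = torch.zeros(4, 8, dtype=torch.int32)
    seq_lens_cpu = torch.tensor([3, 5, 2, 7])
    slot_mapping_cpu = torch.arange(32)


print(Builder(FakeRunner()).build(num_reqs=2, num_actual_tokens=8))
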
@@ -198,12 +266,12 @@ def build(self,
 
         attn_metadata = AscendMetadata(
             num_actual_tokens=num_actual_tokens,
-            block_tables=block_table,
-            query_start_loc=query_start_loc,
-            query_lens=query_lens,
-            seq_lens=seq_lens,
+            block_tables=build_info.block_tables,
+            query_start_loc=build_info.query_start_loc,
+            query_lens=build_info.query_lens,
+            seq_lens=build_info.seq_lens,
             max_query_len=max_query_len,
-            slot_mapping=slot_mapping,
+            slot_mapping=build_info.slot_mapping,
             attn_mask=attn_mask,
             attn_state=attn_state,
             enable_dbo_across_dp=enable_dbo_across_dp,
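
The widened build signature also matters for callers: the trailing *args and **kwargs mean existing call sites keep working, and backend-specific keyword arguments that the Ascend builder does not understand are absorbed instead of raising TypeError. A small standalone illustration follows, where graph_pad_size is a hypothetical extra kwarg, not something this file uses.

# Standalone illustration (not vllm-ascend code) of the *args/**kwargs tail
# added to build(): unknown keyword arguments are absorbed, not rejected.
def build(num_reqs, num_actual_tokens, max_query_len,
          enable_dbo_across_dp=False, is_only_prefill=False, *args, **kwargs):
    return {"num_reqs": num_reqs, "ignored_kwargs": sorted(kwargs)}


print(build(4, 128, 32))  # old-style positional call still works
print(build(4, 128, 32, is_only_prefill=True,
            graph_pad_size=0))  # hypothetical extra kwarg, silently absorbed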