
Commit be66ea7

refact attn metadata build

Signed-off-by: weiguihua2 <[email protected]>

1 parent: c416bbf

File tree

5 files changed: +6 −22 lines


vllm_ascend/attention/utils.py

Lines changed: 2 additions & 4 deletions
@@ -39,10 +39,8 @@ class AscendCommonAttentionMetadata:
     spec_attn_mask: torch.Tensor = None
     attn_state: AscendAttentionState = None
 
-    decode_token_per_req: int
-
-    max_num_blocks_per_req: int
-
+    max_query_len: int
+
     enable_dbo_across_dp: bool = False
 
     is_only_prefill: bool = False
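
After this hunk, the shared metadata dataclass no longer carries decode_token_per_req or max_num_blocks_per_req and requires max_query_len instead. The following is a minimal sketch of that shape, not the real vllm_ascend definition; the field subset and ordering are assumptions drawn from the hunks in this commit.

# Hedged sketch only: field subset and ordering inferred from this commit's hunks,
# not copied from vllm_ascend/attention/utils.py.
from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class CommonAttentionMetadataSketch:
    query_start_loc: torch.Tensor
    seq_lens_cpu: torch.Tensor
    max_query_len: int                 # newly required field in this commit
    num_reqs: int
    num_actual_tokens: int
    spec_attn_mask: Optional[torch.Tensor] = None
    enable_dbo_across_dp: bool = False
    is_only_prefill: bool = False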

vllm_ascend/torchair/torchair_model_runner.py

Lines changed: 0 additions & 1 deletion
@@ -77,7 +77,6 @@ def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
                 attn_mask=self.attn_mask,
                 spec_attn_mask=self.spec_attn_mask,
                 attn_state=self.attn_state,
-                decode_token_per_req=self.decode_token_per_req,
             )
             attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(common_attn_metadata)
         else:

vllm_ascend/worker/eagle_proposer_v1.py

Lines changed: 3 additions & 6 deletions
@@ -129,20 +129,17 @@ def propose(
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.runner.query_start_loc[:batch_size + 1],
             query_start_loc_cpu=self.query_start_loc_cpu[:batch_size + 1],
-            seq_lens=self.runner.seq_lens,
             seq_lens_cpu=self.runner.seq_lens_cpu,
+            max_query_len=max_query_len,
             num_reqs=batch_size,
             num_actual_tokens=num_tokens,
-            max_query_len=max_query_len,
             actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
             block_table_tensor=self.runner.input_batch.block_table[0].get_device_tensor(),
-            slot_mapping_cpu=self.runner.slot_mapping_cpu,
-            positions=self.positions,
+            slot_mapping_cpu=target_slot_mapping,
+            positions=target_positions,
             attn_mask=self.runner.attn_mask,
             spec_attn_mask=self.runner.spec_attn_mask,
             attn_state=self.runner.attn_state,
-            decode_token_per_req=self.runner.decode_token_per_req,
-            max_num_blocks_per_req=self.runner.max_num_blocks_per_req,
         )
         # FIXME(woosuk): The below two ops cause synchronization. Optimize.
         attn_metadata = self.runner.attn_metadata_builder.build(common_attn_metadata)
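
In this call site the proposer now passes max_query_len directly. As a hedged illustration (an assumption about the usual vLLM semantics of query_start_loc, not code from this commit), query_start_loc is the cumulative sum of per-request query lengths, so a max_query_len value can be recovered from its successive differences:

import torch

# query_start_loc for 4 requests with query lengths 4, 0, 5, 2 (illustrative values)
query_start_loc_cpu = torch.tensor([0, 4, 4, 9, 11])
query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
max_query_len = int(query_lens.max())
print(query_lens.tolist(), max_query_len)  # [4, 0, 5, 2] 5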

vllm_ascend/worker/model_runner_v1.py

Lines changed: 1 addition & 7 deletions
@@ -801,19 +801,17 @@ def get_eagle_atten_dict(
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.query_start_loc[:num_reqs + 1],
             query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
-            seq_lens=self.seq_lens,
             seq_lens_cpu=self.seq_lens_cpu,
             num_reqs=num_reqs,
-            num_actual_tokens=total_num_scheduled_tokens,
             max_query_len=max_num_scheduled_tokens,
+            num_actual_tokens=total_num_scheduled_tokens,
             actual_seq_lengths_q=self.actual_seq_lengths_q,
             block_table_tensor=self.input_batch.block_table[0].get_device_tensor(),
             slot_mapping_cpu=self.slot_mapping_cpu,
             positions=self.positions,
             attn_mask=self.attn_mask,
             spec_attn_mask=self.spec_attn_mask,
             attn_state=self.attn_state,
-            decode_token_per_req=self.decode_token_per_req,
             max_num_blocks_per_req=self.max_num_blocks_per_req,
         )
         attn_metadata_i = self.attn_metadata_builder.build(common_attn_metadata)

@@ -1223,20 +1221,16 @@ def _process_reqs(
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.query_start_loc[:num_reqs + 1],
             query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
-            seq_lens=self.seq_lens,
             seq_lens_cpu=self.seq_lens_cpu,
             num_reqs=num_reqs,
             num_actual_tokens=total_num_scheduled_tokens,
-            max_query_len=max_num_scheduled_tokens,
             actual_seq_lengths_q=self.actual_seq_lengths_q,
             block_table_tensor=self.input_batch.block_table[0].get_device_tensor(),
             slot_mapping_cpu=self.slot_mapping_cpu,
             positions=self.positions,
             attn_mask=self.attn_mask,
             spec_attn_mask=self.spec_attn_mask,
             attn_state=self.attn_state,
-            decode_token_per_req=self.decode_token_per_req,
-            max_num_blocks_per_req=self.max_num_blocks_per_req,
             enable_dbo_across_dp=enable_dbo,
             is_only_prefill=is_only_prefill,
             graph_pad_size=self.graph_pad_size
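
Both hunks above drop decode_token_per_req, and the _process_reqs hunk also drops max_num_blocks_per_req while still passing block_table_tensor. As a hedged sketch (an assumption about why the explicit field can be omitted, not code from this repo), a builder that already receives the per-request block table can recover the block capacity from the tensor's shape:

import torch

# block_table_tensor shaped [num_reqs, max_num_blocks_per_req] (illustrative sizes)
block_table_tensor = torch.zeros((8, 64), dtype=torch.int32)
max_num_blocks_per_req = block_table_tensor.shape[1]
print(max_num_blocks_per_req)  # 64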

vllm_ascend/worker/mtp_proposer_v1.py

Lines changed: 0 additions & 4 deletions
@@ -170,7 +170,6 @@ def propose(
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.runner.query_start_loc[:batch_size + 1],
             query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size + 1],
-            seq_lens=target_positions[last_token_indices] + 1,
             seq_lens_cpu=target_positions.cpu()[last_token_indices] + 1,
             num_reqs=batch_size,
             num_actual_tokens=num_tokens,

@@ -182,8 +181,6 @@
             attn_mask=self.runner.attn_mask,
             spec_attn_mask=self.runner.spec_attn_mask,
             attn_state=self.runner.attn_state,
-            decode_token_per_req=self.runner.decode_token_per_req,
-            max_num_blocks_per_req=self.runner.max_num_blocks_per_req,
             graph_pad_size=extra_builder_kwargs['graph_pad_size']
         )
         attn_metadata = self.runner.attn_metadata_builder.build(common_attn_metadata)

@@ -302,7 +299,6 @@ def dummy_run(self,
             actual_seq_lengths_q=self.runner.actual_seq_lengths_q,
             attn_mask=self.runner.attn_mask,
             spec_attn_mask=self.runner.spec_attn_mask,
-            decode_token_per_req=self.runner.decode_token_per_req,
         )
         attn_metadata = self.runner.attn_metadata_builder.build_torchair_graph_dummy(common_attn_metadata)
