Refactor attention metadata build

weiguihua2 · weiguihua2 · commit ef3b644d86a3 · 2025-08-17T21:53:31.000+08:00
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py
@@ -106,7 +106,7 @@ def split_decodes_and_prefills(
         return num_reqs, 0, num_tokens, 0
 
     first_prefill = is_prefill.int().argmax(dim=-1).item()
-    assert torch.all(query_lens[first_prefill:] > decode_threshold)
+    assert torch.all(query_lens[first_prefill:] >= decode_threshold)
     assert torch.all(query_lens[:first_prefill] <= decode_threshold)
     num_decodes = first_prefill
     num_prefills = num_reqs - num_decodes
diff --git a/vllm_ascend/worker/eagle_proposer_v1.py b/vllm_ascend/worker/eagle_proposer_v1.py
@@ -128,7 +128,7 @@ def propose(
 
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=self.runner.query_start_loc[:batch_size + 1],
-            query_start_loc_cpu=self.query_start_loc_cpu[:batch_size + 1],
+            query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size + 1],
             seq_lens_cpu=self.runner.seq_lens_cpu,
             max_query_len=max_query_len,
             num_reqs=batch_size,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -815,7 +815,7 @@ def get_eagle_atten_dict(
                 max_num_blocks_per_req=self.max_num_blocks_per_req,
                 decode_token_per_req=self.decode_token_per_req,
             )
-            attn_metadata_i = self.attn_metadata_builder.build(common_attn_metadata, self.model)
+            attn_metadata_i = self.attn_metadata_builder.build(common_attn_metadata, self.get_model())
             for layer_name in kv_cache_group_spec.layer_names:
                 attn_metadata[layer_name] = attn_metadata_i
 
diff --git a/vllm_ascend/worker/mtp_proposer_v1.py b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -168,9 +168,9 @@ def propose(
             num_input_tokens = num_tokens
 
         common_attn_metadata = AscendCommonAttentionMetadata(
-            query_start_loc=self.runner.query_start_loc[:batch_size + 1],
-            query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size + 1],
-            seq_lens_cpu=target_positions.cpu()[last_token_indices] + 1,
+            query_start_loc=cu_num_tokens[:batch_size + 1],
+            query_start_loc_cpu=cu_num_tokens[:batch_size + 1].cpu(),
+            seq_lens_cpu=seq_lens.cpu(),
             num_reqs=batch_size,
             num_actual_tokens=num_tokens,
             max_query_len=max_query_len,
@@ -184,7 +184,7 @@ def propose(
             graph_pad_size=extra_builder_kwargs['graph_pad_size'],
             decode_token_per_req=self.runner.decode_token_per_req,
         )
-        attn_metadata = self.runner.attn_metadata_builder.build(common_attn_metadata, self.runner.model)
+        attn_metadata = self.runner.attn_metadata_builder.build(common_attn_metadata, self.runner.get_model())
 
         self.positions[:num_tokens] = target_positions
         self.hidden_states[:num_tokens] = target_hidden_states

Original file line number	Diff line number	Diff line change
`@@ -815,7 +815,7 @@ def get_eagle_atten_dict(`
`815`	`815`	`max_num_blocks_per_req=self.max_num_blocks_per_req,`
`816`	`816`	`decode_token_per_req=self.decode_token_per_req,`
`817`	`817`	`)`
`818`		`- attn_metadata_i = self.attn_metadata_builder.build(common_attn_metadata, self.model)`
	`818`	`+ attn_metadata_i = self.attn_metadata_builder.build(common_attn_metadata, self.get_model())`
`819`	`819`	`for layer_name in kv_cache_group_spec.layer_names:`
`820`	`820`	`attn_metadata[layer_name] = attn_metadata_i`
`821`	`821`