Refactor model runner

weiguihua2 · weiguihua2 · commit d4d9ee24df53 · 2025-08-19T15:25:03.000+08:00
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py
@@ -53,32 +53,6 @@ class AscendCommonAttentionMetadata:
     graph_pad_size: int = -1
 
 
-@dataclass
-class TorchairCommonAttentionMetadata:
-    """
-    Per-batch attention metadata, shared across layers and backends.
-    AttentionMetadataBuilder instances use it to construct per-layer metadata.
-    
-    For many of the tensors we keep both GPU and CPU versions.
-    """
-
-    num_reqs: int
-    """Number of requests"""
-
-    num_actual_tokens: int
-    """Total number of tokens in batch"""
-
-    decode_token_per_req: int
-
-    actual_seq_lengths_q: list[int]
-
-    attn_mask: torch.Tensor = None
-
-    spec_attn_mask: torch.Tensor = None
-
-    graph_pad_size: int = -1
-
-
 def split_decodes_and_prefills(
     common_attn_metadata: AscendCommonAttentionMetadata,
     decode_threshold: int = 1,