Commit 3cca936

refactor model runner
Signed-off-by: weiguihua2 <[email protected]>
1 parent ab1d21f commit 3cca936

File tree: 6 files changed, +42 −21 lines

tests/ut/attention/test_mla_v1.py (1 addition, 1 deletion)

@@ -11,7 +11,7 @@
                                   AscendMLAImpl, AscendMLAMetadata,
                                   AscendMLAMetadataBuilder,
                                   AscendMLAPrefillMetadata)
-from vllm_ascend.attention.utils import TorchairCommonAttentionMetadata
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 
 
 class TestAscendMLABackend(TestBase):
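
Aside from the new dataclass added to vllm_ascend/torchair/utils.py, nearly every hunk in this commit is the same mechanical change: TorchairCommonAttentionMetadata moves from vllm_ascend.attention.utils to vllm_ascend.torchair.utils, and callers are updated. A minimal sketch of what downstream code has to change, with both paths taken directly from the diff:

    # Before this commit:
    # from vllm_ascend.attention.utils import TorchairCommonAttentionMetadata

    # After this commit:
    from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata

The same relocation repeats in mla_v1.py, torchair_attention.py, torchair_model_runner.py, and mtp_proposer_v1.py below.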

vllm_ascend/attention/mla_v1.py (2 additions, 2 deletions)

@@ -19,13 +19,13 @@
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
-                                         TorchairCommonAttentionMetadata,
                                          split_decodes_and_prefills)
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
+                                        npu_stream_switch, npu_wait_tensor)
 from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch
vllm_ascend/torchair/torchair_attention.py (8 additions, 14 deletions)

@@ -22,28 +22,19 @@
 import torch
 import torch.nn as nn
 import torch_npu
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer, AttentionType)
-from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
-from vllm.config import VllmConfig
-from vllm.utils import cdiv
-from vllm.v1.core.sched.output import SchedulerOutput
-
-from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer,
                                               AttentionType)
 from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.utils import cdiv
 
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
-from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
-                                         TorchairCommonAttentionMetadata)
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
-from vllm_ascend.worker.npu_input_batch import InputBatch
 
 
 class AscendAttentionTorchairBackend(AscendAttentionBackend):

@@ -108,9 +99,12 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
 
     def __init__(self, runner):
         super().__init__(runner)
-        self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
-                                           vllm_config.cache_config.block_size)
-        self.max_blocks = (self.model_config.max_model_len + vllm_config.cache_config.block_size - 1) // vllm_config.cache_config.block_size
+        self.max_num_blocks_per_req = cdiv(
+            self.model_config.max_model_len,
+            self.vllm_config.cache_config.block_size)
+        self.max_blocks = (self.model_config.max_model_len +
+                           self.vllm_config.cache_config.block_size -
+                           1) // self.vllm_config.cache_config.block_size
 
     def _get_graph_runner_block_tables(
             self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor:
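
Two things happen in this file beyond the import move. The first hunk drops a duplicated import block left over from an earlier merge, and the second hunk fixes a bug: the old lines referenced a bare vllm_config, which is not defined in __init__'s scope, so the new code qualifies it as self.vllm_config. The rewritten builder also leaves two equivalent ceiling divisions side by side: vLLM's cdiv helper and the manual (a + b - 1) // b idiom produce the same block count. A standalone sketch with made-up config numbers (cdiv is reimplemented here so the snippet runs on its own):

    def cdiv(a: int, b: int) -> int:
        # Ceiling integer division, matching vLLM's cdiv helper.
        return -(a // -b)

    max_model_len = 4096  # hypothetical model_config.max_model_len
    block_size = 128      # hypothetical cache_config.block_size

    max_num_blocks_per_req = cdiv(max_model_len, block_size)
    max_blocks = (max_model_len + block_size - 1) // block_size

    # Both idioms round up to the same number of KV-cache blocks.
    assert max_num_blocks_per_req == max_blocks == 32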

vllm_ascend/torchair/torchair_model_runner.py (2 additions, 2 deletions)

@@ -25,9 +25,9 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 
-from vllm_ascend.attention.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
+                                        check_torchair_cache_exist,
                                         register_torchair_model,
                                         write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,

vllm_ascend/torchair/utils.py (27 additions, 0 deletions)

@@ -2,6 +2,7 @@
 import os
 import shutil
 from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
 
 import torch
 
@@ -20,6 +21,32 @@
     'TORCHAIR_CACHE_HOME', os.path.join(os.getcwd(), TORCHAIR_CACHE_PATH_NAME))
 
 
+@dataclass
+class TorchairCommonAttentionMetadata:
+    """
+    Per-batch attention metadata, shared across layers and backends.
+    AttentionMetadataBuilder instances use it to construct per-layer metadata.
+
+    For many of the tensors we keep both GPU and CPU versions.
+    """
+
+    num_reqs: int
+    """Number of requests"""
+
+    num_actual_tokens: int
+    """Total number of tokens in batch"""
+
+    decode_token_per_req: int
+
+    actual_seq_lengths_q: list[int]
+
+    attn_mask: torch.Tensor = None
+
+    spec_attn_mask: torch.Tensor = None
+
+    graph_pad_size: int = -1
+
+
 @contextmanager
 def _file_lock(file_descriptor, lock_type):
     fcntl.flock(file_descriptor, lock_type)
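
For readers tracking the new type, here is a minimal, hypothetical construction of the dataclass added above. The field values are invented for illustration (a pure-decode batch of four requests, one token each), and the optional fields are shown explicitly at their defaults:

    import torch

    from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata

    metadata = TorchairCommonAttentionMetadata(
        num_reqs=4,
        num_actual_tokens=4,
        decode_token_per_req=1,
        actual_seq_lengths_q=[1, 2, 3, 4],  # invented per-request query lengths
        attn_mask=None,        # defaults, spelled out for clarity
        spec_attn_mask=None,
        graph_pad_size=-1,
    )
    assert metadata.num_reqs == len(metadata.actual_seq_lengths_q)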

vllm_ascend/worker/mtp_proposer_v1.py (2 additions, 2 deletions)

@@ -16,9 +16,9 @@
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
-from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
-                                         TorchairCommonAttentionMetadata)
+from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP
+from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata
 from vllm_ascend.utils import ProfileExecuteDuration