Commit 292fb8f

[1/N][Refactor] torchair model runner refactor (#2205)
There is a lot of torchair code in the model runner, which makes the code hard to maintain. We'll create a new torchair_model_runner to split out the torchair-related logic. Following the workflow in #2203, this is the first PR. What this PR does: create the new torchair model runner; more functions will be added later.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@586f286

Signed-off-by: wangxiyuan <[email protected]>
1 parent 458ab2d commit 292fb8f
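
In outline, the change below is a template-method split: the base worker's device setup is extracted into _init_device() so a torchair-specific worker can reuse it while constructing its own runner class. A minimal, self-contained sketch of that shape, using simplified stand-in classes rather than the project's real signatures (the real constructors take a VllmConfig and an NPU torch.device):

# Sketch of the split introduced in this PR; names simplified.

class ModelRunner:
    """Stand-in for NPUModelRunner."""
    def __init__(self, config, device):
        self.config = config
        self.device = device


class TorchairModelRunner(ModelRunner):
    """Stand-in for NPUTorchairModelRunner: a pass-through subclass
    for now; torchair-specific logic lands here in follow-up PRs."""


class Worker:
    """Stand-in for the worker in worker_v1.py."""
    def __init__(self, config):
        self.config = config

    def _init_device(self):
        # Shared setup (set device, empty cache, init distributed
        # env, seed) lives here so subclasses can reuse it.
        return "npu:0"

    def init_device(self):
        device = self._init_device()
        self.model_runner = ModelRunner(self.config, device)


class TorchairWorker(Worker):
    """Stand-in for the worker in torchair_worker.py."""
    def init_device(self):
        # Same device setup, different runner class.
        device = self._init_device()
        self.model_runner = TorchairModelRunner(self.config, device)


if __name__ == "__main__":
    worker = TorchairWorker(config={})
    worker.init_device()
    print(type(worker.model_runner).__name__)  # TorchairModelRunner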

File tree

4 files changed: +50 -9 lines changed

.github/workflows/vllm_ascend_test.yaml
vllm_ascend/torchair/torchair_model_runner.py
vllm_ascend/torchair/torchair_worker.py
vllm_ascend/worker/worker_v1.py

.github/workflows/vllm_ascend_test.yaml

Lines changed: 10 additions & 7 deletions
@@ -196,6 +196,13 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           pytest -sv tests/e2e/singlecard/test_camem.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
+
+          # ------------------------------------ v1 spec decode test ------------------------------------ #
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+
+          # All other tests, ignore: 310p test, accuracy test.
           pytest -sv tests/e2e/singlecard/ \
             --ignore=tests/e2e/singlecard/test_offline_inference.py \
             --ignore=tests/e2e/singlecard/test_ilama_lora.py \
@@ -204,13 +211,9 @@ jobs:
             --ignore=tests/e2e/singlecard/test_embedding.py \
             --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
             --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
-            --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
-          # ------------------------------------ v1 spec decode test ------------------------------------ #
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
-          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
-
-  e2e-4-cards:
+            --ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
+            --ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
+  e2e-2-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
     strategy:
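
In short: the workflow change moves the v1 spec decode tests ahead of the catch-all singlecard run (now invoked without the VLLM_USE_MODELSCOPE=True prefix), additionally ignores models/test_lm_eval_correctness.py in that run, and renames the multi-card job from e2e-4-cards to e2e-2-cards.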
vllm_ascend/torchair/torchair_model_runner.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
+#
+
+import torch
+from vllm.config import VllmConfig
+
+from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+
+
+class NPUTorchairModelRunner(NPUModelRunner):
+
+    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        super().__init__(vllm_config, device)

vllm_ascend/torchair/torchair_worker.py

Lines changed: 7 additions & 0 deletions
@@ -17,6 +17,7 @@
 from vllm.logger import logger

 import vllm_ascend.envs as envs_ascend
+from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
 from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
                                         check_torchair_cache_exist,
                                         delete_torchair_cache_file,
@@ -52,3 +53,9 @@ def determine_available_memory(self) -> int:
         self.model_runner.new_kv_cache_bytes = available_kv_cache_memory

         return available_kv_cache_memory
+
+    def init_device(self):
+        """Override init_device to init torchair model runner"""
+        device = self._init_device()
+        # Init ModelRunner here, so that we have access to self.device.
+        self.model_runner = NPUTorchairModelRunner(self.vllm_config, device)

vllm_ascend/worker/worker_v1.py

Lines changed: 4 additions & 2 deletions
@@ -130,17 +130,19 @@ def initialize_cache(self, num_gpu_blocks: int,
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks

-    def init_device(self):
+    def _init_device(self):
         device = torch.device(f"npu:{self.local_rank}")
         NPUPlatform.set_device(device)
         NPUPlatform.empty_cache()
         self.init_npu_memory = NPUPlatform.mem_get_info()[0]
-
         # Initialize the distributed environment.
         self._init_worker_distributed_environment()
         # Set random seed.
         NPUPlatform.seed_everything(self.model_config.seed)
+        return device

+    def init_device(self):
+        device = self._init_device()
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = NPUModelRunner(self.vllm_config, device)

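Taken together, the worker_v1.py change turns init_device() into a thin template method: all shared NPU setup (device selection, cache reset, distributed init, seeding) lives in _init_device(), and each worker variant only decides which runner class to construct. Follow-up PRs in the #2203 workflow can then move torchair-specific behavior out of NPUModelRunner and into NPUTorchairModelRunner without touching the shared setup path.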