Commit 292fb8f

[1/N][Refactor] torchair model runner refactor (#2205)
There is a lot of torchair code in the model runner, which makes the code hard to maintain. We'll create a new torchair_model_runner to split out the torchair-related logic. Following the workflow in #2203, this is the first PR. What this PR does: create the new torchair model runner; more functions will be added later.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@586f286

Signed-off-by: wangxiyuan <[email protected]>
1 parent 458ab2d commit 292fb8f
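
In outline, the change below is a template-method split: the base worker's device setup is extracted into _init_device() so a torchair-specific worker can reuse it while constructing its own runner class. A minimal, self-contained sketch of that shape, using simplified stand-in classes rather than the project's real signatures (the real constructors take a VllmConfig and an NPU torch.device):

# Sketch of the split introduced in this PR; names simplified.

class ModelRunner:
    """Stand-in for NPUModelRunner."""
    def __init__(self, config, device):
        self.config = config
        self.device = device


class TorchairModelRunner(ModelRunner):
    """Stand-in for NPUTorchairModelRunner: a pass-through subclass
    for now; torchair-specific logic lands here in follow-up PRs."""


class Worker:
    """Stand-in for the worker in worker_v1.py."""
    def __init__(self, config):
        self.config = config

    def _init_device(self):
        # Shared setup (set device, empty cache, init distributed
        # env, seed) lives here so subclasses can reuse it.
        return "npu:0"

    def init_device(self):
        device = self._init_device()
        self.model_runner = ModelRunner(self.config, device)


class TorchairWorker(Worker):
    """Stand-in for the worker in torchair_worker.py."""
    def init_device(self):
        # Same device setup, different runner class.
        device = self._init_device()
        self.model_runner = TorchairModelRunner(self.config, device)


if __name__ == "__main__":
    worker = TorchairWorker(config={})
    worker.init_device()
    print(type(worker.model_runner).__name__)  # TorchairModelRunner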

File tree

4 files changed: +50 -9 lines changed

.github/workflows/vllm_ascend_test.yaml
vllm_ascend/torchair/torchair_model_runner.py
vllm_ascend/torchair/torchair_worker.py
vllm_ascend/worker/worker_v1.py

.github/workflows/vllm_ascend_test.yaml

Lines changed: 10 additions & 7 deletions
@@ -196,6 +196,13 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           pytest -sv tests/e2e/singlecard/test_camem.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
+
+          # ------------------------------------ v1 spec decode test ------------------------------------ #
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
+          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+
+          # All other tests, ignore: 310p test, accuracy test.
           pytest -sv tests/e2e/singlecard/ \
             --ignore=tests/e2e/singlecard/test_offline_inference.py \
             --ignore=tests/e2e/singlecard/test_ilama_lora.py \
@@ -204,13 +211,9 @@ jobs:
             --ignore=tests/e2e/singlecard/test_embedding.py \
             --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
             --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
-            --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
-          # ------------------------------------ v1 spec decode test ------------------------------------ #
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
-          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
-
-  e2e-4-cards:
+            --ignore=tests/e2e/singlecard/test_offline_inference_310p.py \
+            --ignore=tests/e2e/singlecard/models/test_lm_eval_correctness.py
+  e2e-2-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
     strategy:
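
In short: the workflow change moves the v1 spec decode tests ahead of the catch-all singlecard run (now invoked without the VLLM_USE_MODELSCOPE=True prefix), additionally ignores models/test_lm_eval_correctness.py in that run, and renames the multi-card job from e2e-4-cards to e2e-2-cards.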
vllm_ascend/torchair/torchair_model_runner.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
+#
+
+import torch
+from vllm.config import VllmConfig
+
+from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+
+
+class NPUTorchairModelRunner(NPUModelRunner):
+
+    def __init__(self, vllm_config: VllmConfig, device: torch.device):
+        super().__init__(vllm_config, device)

vllm_ascend/torchair/torchair_worker.py

Lines changed: 7 additions & 0 deletions
@@ -17,6 +17,7 @@
 from vllm.logger import logger

 import vllm_ascend.envs as envs_ascend
+from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner
 from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist,
                                         check_torchair_cache_exist,
                                         delete_torchair_cache_file,
@@ -52,3 +53,9 @@ def determine_available_memory(self) -> int:
         self.model_runner.new_kv_cache_bytes = available_kv_cache_memory

         return available_kv_cache_memory
+
+    def init_device(self):
+        """Override init_device to init torchair model runner"""
+        device = self._init_device()
+        # Init ModelRunner here, so that we have access to self.device.
+        self.model_runner = NPUTorchairModelRunner(self.vllm_config, device)

vllm_ascend/worker/worker_v1.py

Lines changed: 4 additions & 2 deletions
@@ -130,17 +130,19 @@ def initialize_cache(self, num_gpu_blocks: int,
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks

-    def init_device(self):
+    def _init_device(self):
         device = torch.device(f"npu:{self.local_rank}")
         NPUPlatform.set_device(device)
         NPUPlatform.empty_cache()
         self.init_npu_memory = NPUPlatform.mem_get_info()[0]
-
         # Initialize the distributed environment.
         self._init_worker_distributed_environment()
         # Set random seed.
         NPUPlatform.seed_everything(self.model_config.seed)
+        return device

+    def init_device(self):
+        device = self._init_device()
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = NPUModelRunner(self.vllm_config, device)

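Taken together, the worker_v1.py change turns init_device() into a thin template method: all shared NPU setup (device selection, cache reset, distributed init, seeding) lives in _init_device(), and each worker variant only decides which runner class to construct. Follow-up PRs in the #2203 workflow can then move torchair-specific behavior out of NPUModelRunner and into NPUTorchairModelRunner without touching the shared setup path.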