
Commit 668d7db

Merge branch 'vllm-project:main' into main
2 parents: b9b3b00 + 61866b8

25 files changed: +170 −317 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 3 additions & 9 deletions
@@ -81,7 +81,7 @@ jobs:
       VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     steps:
       - name: Install packages
         run: |
@@ -137,7 +137,7 @@
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-1]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -185,9 +185,6 @@
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi
 
       - name: Run e2e test
         env:
@@ -222,7 +219,7 @@
       max-parallel: 2
       matrix:
         os: [linux-aarch64-a2-2]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -270,9 +267,6 @@
         run: |
           pip install -r requirements-dev.txt
           pip install -v -e .
-          if [[ "${{ matrix.vllm_version }}" == "v0.10.0" ]]; then
-            pip install "transformers<4.54.0"
-          fi
 
       - name: Run vllm-project/vllm-ascend test
         env:

.github/workflows/vllm_ascend_test_310p.yaml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
-        vllm_version: [main, v0.10.0]
+        vllm_version: [main]
     name: 310p e2e test
     runs-on: ${{ matrix.os }}
     container:

docs/source/tutorials/multi_node_kimi.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 referring to [multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process)
 
 ## Run with docker
-Assume you have two Atlas 800 A3(64G*16) nodes(or 4 *A2* 8), and want to deploy the `Kimi-K2-Instruct-W8A8` quantitative model across multi-node.
+Assume you have two Atlas 800 A3(64G*16) nodes(or 4 * A2), and want to deploy the `Kimi-K2-Instruct-W8A8` quantitative model across multi-node.
 
 ```{code-block} bash
 :substitutions:

docs/source/tutorials/single_npu_qwen3_quantization.md

Lines changed: 5 additions & 2 deletions
@@ -32,12 +32,15 @@ see https://www.modelscope.cn/models/vllm-ascend/Qwen3-8B-W4A8
 :::
 
 ```bash
+git clone https://gitee.com/ascend/msit
+cd msit/msmodelslim
+
 # Optional, this commit has been verified
-git clone https://gitee.com/ascend/msit -b f8ab35a772a6c1ee7675368a2aa4bafba3bedd1a
+git checkout f8ab35a772a6c1ee7675368a2aa4bafba3bedd1a
 
-cd msit/msmodelslim
 # Install by run this script
 bash install.sh
+pip install accelerate
 
 cd example/Qwen
 # Original weight path, Replace with your local model path

docs/source/user_guide/feature_guide/quantization.md

Lines changed: 2 additions & 1 deletion
@@ -12,10 +12,11 @@ Install modelslim:
 
 ```bash
 git clone https://gitee.com/ascend/msit
+cd msit/msmodelslim
+
 # Optional, this commit has been verified
 git checkout f8ab35a772a6c1ee7675368a2aa4bafba3bedd1a
 
-cd msit/msmodelslim
 bash install.sh
 pip install accelerate
 ```

tests/ut/core/test_scheduler.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def create_requests(
         request_id=f"{i}",
         prompt_token_ids=[i] * num_tokens,
         sampling_params=sampling_params,
-        multi_modal_inputs=mm_inputs,
+        multi_modal_kwargs=mm_inputs,
         multi_modal_placeholders=mm_position,
         multi_modal_hashes=None,
         eos_token_id=EOS_TOKEN_ID,
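This rename tracks the upstream vLLM `Request` constructor, which now takes `multi_modal_kwargs`; the same substitution appears in `tests/ut/kv_connector/utils.py` below. A minimal sketch of the updated call — the import paths and filler values are assumptions, and arguments the helper passes beyond those visible in the hunk are elided:

```python
from vllm.sampling_params import SamplingParams
from vllm.v1.request import Request  # assumed import path for the v1 Request

EOS_TOKEN_ID = 50256  # stand-in; the test module defines its own constant

request = Request(
    request_id="0",
    prompt_token_ids=[0] * 10,
    sampling_params=SamplingParams(),
    multi_modal_kwargs=None,          # formerly multi_modal_inputs
    multi_modal_placeholders=None,
    multi_modal_hashes=None,
    eos_token_id=EOS_TOKEN_ID,
)
```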

tests/ut/kv_connector/test_remote_decode_lifecycle.py

Lines changed: 8 additions & 15 deletions
@@ -25,7 +25,6 @@
                                create_model_runner_output,
                                create_request, create_scheduler,
                                create_vllm_config)
-from vllm_ascend.utils import vllm_version_is
 
 
 def test_basic_lifecycle():
@@ -103,13 +102,10 @@ def test_basic_lifecycle():
 
     # (3b): execute_model()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_id])
 
     # (3c): update_from_output()
     scheduler.update_from_output(scheduler_output, model_runner_output)
@@ -164,13 +160,10 @@ def test_prefix_cache_lifecycle():
     scheduler_output = scheduler.schedule()
     scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_sending = [request_remote.request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_sending=[request_remote.request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_sending=[request_remote.request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     _ = scheduler.schedule()
     assert_scheduler_empty(scheduler)

tests/ut/kv_connector/test_remote_prefill_lifecycle.py

Lines changed: 8 additions & 15 deletions
@@ -25,7 +25,6 @@
                                create_model_runner_output,
                                create_request, create_scheduler,
                                create_vllm_config)
-from vllm_ascend.utils import vllm_version_is
 
 
 def test_basic_lifecycle():
@@ -91,13 +90,10 @@ def test_basic_lifecycle():
 
     # (2b): forward(): request finishes recv.
    model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])
 
     # (2c): update_from_output():
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
@@ -211,13 +207,10 @@ def test_full_block_prompt():
     # # STEP (2): Recv.
     scheduler_output = scheduler.schedule()
     model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
-    if vllm_version_is("0.10.0"):
-        model_runner_output.finished_recving = [request_id]
-    else:
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        model_runner_output.kv_connector_output = KVConnectorOutput(
-            finished_recving=[request_id])
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    model_runner_output.kv_connector_output = KVConnectorOutput(
+        finished_recving=[request_id])
     scheduler.update_from_output(scheduler_output, model_runner_output)
     assert len(scheduler.waiting) == 1
     assert (request_id in scheduler.finished_recving_kv_req_ids)

tests/ut/kv_connector/utils.py

Lines changed: 6 additions & 14 deletions
@@ -157,7 +157,7 @@ def create_request(
         request_id=f"id-{request_id}",
         prompt_token_ids=prompt_token_ids,
         sampling_params=sampling_params,
-        multi_modal_inputs=None,
+        multi_modal_kwargs=None,
         multi_modal_placeholders=None,
         multi_modal_hashes=None,
         **({
@@ -187,19 +187,11 @@ def create_model_runner_output(
 
     # Make output data structure.
     extra_args = {}
-    if not vllm_version_is("0.10.0"):
-        from vllm.v1.worker.kv_connector_model_runner_mixin import \
-            KVConnectorOutput  # type: ignore # noqa
-        kv_connector_output = KVConnectorOutput(
-            finished_sending=finished_sending,
-            finished_recving=finished_recving)
-        extra_args = {"kv_connector_output": kv_connector_output}
-    else:
-        extra_args = {
-            "finished_sending": finished_sending,
-            "finished_recving": finished_recving,
-        }
-
+    from vllm.v1.worker.kv_connector_model_runner_mixin import \
+        KVConnectorOutput  # type: ignore # noqa
+    kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
+                                            finished_recving=finished_recving)
+    extra_args = {"kv_connector_output": kv_connector_output}
     return ModelRunnerOutput(
         req_ids=req_ids,
         req_id_to_index=req_id_to_index,
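The three `kv_connector` test changes above share one pattern: with v0.10.0 support dropped, finished transfer ids always ride inside a `KVConnectorOutput` attached to the runner output, never as bare `finished_sending`/`finished_recving` fields. A condensed sketch of that pattern; the `KVConnectorOutput` import follows the hunks above, while the home of `EMPTY_MODEL_RUNNER_OUTPUT` is an assumption:

```python
import copy

# EMPTY_MODEL_RUNNER_OUTPUT's import path is assumed here; the tests pull it
# from their shared utilities, which this diff does not show.
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput

# Start from the empty template the tests deep-copy before each step.
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)

# Post-v0.10.0 shape: wrap finished transfer ids in KVConnectorOutput and
# attach it, instead of setting the fields on the output directly.
model_runner_output.kv_connector_output = KVConnectorOutput(
    finished_sending=["req-send-0"],  # requests whose KV send completed
    finished_recving=["req-recv-0"])  # requests whose KV recv completed
```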

tests/ut/ops/test_layernorm.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+from unittest.mock import patch
+
+import pytest
+import torch
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+@pytest.fixture
+def dummy_tensor():
+    return torch.randn(4, 8, dtype=torch.float16)
+
+
+def mock_rms_norm(x, weight, eps):
+    return x + 1, None
+
+
+def mock_add_rms_norm(x, residual, weight, eps):
+    return 2 * x, None, 2 * residual
+
+
+@pytest.mark.parametrize("is_310p_return", [True, False])
+@pytest.mark.parametrize("residual",
+                         [None, torch.randn(4, 8, dtype=torch.float32)])
+@patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
+@patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm)
+def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p_return,
+                         residual, dummy_tensor):
+
+    with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
+        layer = RMSNorm(hidden_size=32, eps=1e-05)
+        if residual is not None:
+            out_x, out_residual = layer.forward_oot(dummy_tensor, residual)
+
+            if is_310p_return:
+                expected_arg_x = dummy_tensor + residual.to(dummy_tensor.dtype)
+                expected_out_x = expected_arg_x + 1
+                expected_out_residual = expected_arg_x.to(residual.dtype)
+
+                mock_rmsnorm.assert_called_once()
+                assert torch.allclose(out_x, expected_out_x)
+                assert torch.allclose(out_residual, expected_out_residual)
+            else:
+                expected_out_x = 2 * dummy_tensor
+                expected_out_residual = 2 * residual
+                mock_add_rmsnorm.assert_called_once()
+                assert torch.allclose(out_x, expected_out_x)
+                assert torch.allclose(out_residual, expected_out_residual)
+        else:
+            out_x = layer.forward(dummy_tensor, residual)
+            expected_out_x = dummy_tensor + 1
+
+            mock_rmsnorm.assert_called_once()
+            assert torch.allclose(out_x, expected_out_x)
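The mocks mirror the tuple shapes of the real kernels: `torch_npu.npu_rms_norm` returns the normed tensor first, and `torch_npu.npu_add_rms_norm` returns the normed tensor first and the updated residual last. From the test's assertions, the dispatch being pinned down is roughly the following; this is a reconstruction from the expected values above, not the actual `vllm_ascend` implementation:

```python
from typing import Optional

import torch
import torch_npu  # Ascend kernel bindings; these are what the test mocks

from vllm_ascend.utils import is_310p


def forward_oot(self, x: torch.Tensor,
                residual: Optional[torch.Tensor] = None):
    """Reconstructed sketch of the RMSNorm override under test."""
    if residual is not None:
        if is_310p():
            # 310P path: no fused add+rms_norm kernel, so add the residual
            # in x's dtype, norm the sum, and hand the sum back as the new
            # residual in the residual's original dtype.
            orig_dtype = residual.dtype
            residual = x + residual.to(x.dtype)
            out, _ = torch_npu.npu_rms_norm(residual, self.weight,
                                            self.variance_epsilon)
            return out, residual.to(orig_dtype)
        # Fused path: npu_add_rms_norm returns (normed, _, updated_residual).
        out, _, residual = torch_npu.npu_add_rms_norm(
            x, residual, self.weight, self.variance_epsilon)
        return out, residual
    out, _ = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon)
    return out
```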
