Torchair graph812 cov #2337

Open: wants to merge 12 commits into main (showing changes from all commits).
66 changes: 66 additions & 0 deletions tests/e2e/multicard/test_torchair_graph_mode.py
@@ -21,8 +21,10 @@
"""
import os
from typing import Dict
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm_ascend.ascend_forward_context import _get_fused_moe_state

os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

@@ -162,3 +164,67 @@ def test_e2e_pangu_with_torchair():
},
}
_pangu_torchair_test_fixture(additional_config)


def _qwen_torchair_test_fixture(
model,
tp,
enable_expert_parallel,
):
# The current access control does not support 16 cards,
# so the MC2 operator in Qwen's graph mode cannot run.
# Once 16-card support is available,
# this e2e can be switched to graph mode.
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

additional_config = {
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}

with VllmRunner(
model,
dtype="half",
tensor_parallel_size=tp,
distributed_executor_backend="mp",
enforce_eager=True,
additional_config=additional_config,
enable_expert_parallel=enable_expert_parallel,
) as vllm_model:
# use greedy sampling to make sure the generated results are deterministic
vllm_output = vllm_model.generate_greedy(example_prompts, 5)

# NOTE: vllm-ascend/pangu-pro-moe-pruing is only a part of PanguProMoE
# with 2 hidden layers, so the golden results may seem inaccurate.
# This will only change if accuracy changes with the official weights
# of PanguProMoE.
golden_results = [
'Hello, my name is Remempondeprecatedmiot忱',
'The president of the United States is Remem下的一个 rever ceremoni Segnali',
'The capital of France is Rememvoud administrativ Remem投',
'The future of AI isotope Segnali Zoeken精细化 supus',
]

assert len(golden_results) == len(vllm_output)
for i in range(len(vllm_output)):
print(f"Generated text: {vllm_output[i][1]!r}")


def test_e2e_qwen3_moe_with_torchair():

def stubbed_get_state(ep_size, with_prefill, is_deepseek_v3_r1):
return _get_fused_moe_state(16, with_prefill, is_deepseek_v3_r1)

with patch('vllm_ascend.ascend_forward_context._get_fused_moe_state',
side_effect=stubbed_get_state):
_qwen_torchair_test_fixture("Qwen/Qwen3-30B-A3B", 2, True)
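
The test above patches `_get_fused_moe_state` with a `side_effect` stub so the fused-MoE state is computed as if the expert-parallel size were 16, while the run itself only uses 2 cards. Below is a minimal, self-contained sketch of the same pattern; `select_moe_state` and its return values are hypothetical stand-ins, not the real vllm_ascend function:

```python
# Sketch of the side_effect patching pattern used above. `select_moe_state`
# is a hypothetical stand-in for
# vllm_ascend.ascend_forward_context._get_fused_moe_state.
from unittest.mock import patch


def select_moe_state(ep_size: int, with_prefill: bool,
                     is_deepseek_v3_r1: bool) -> str:
    return "large-ep" if ep_size >= 16 else "small-ep"


# Keep a direct reference to the original so the stub can delegate to it.
_original_select_moe_state = select_moe_state


def forced_16_card_state(ep_size: int, with_prefill: bool,
                         is_deepseek_v3_r1: bool) -> str:
    # Ignore the runtime ep_size and pretend 16 cards are available.
    return _original_select_moe_state(16, with_prefill, is_deepseek_v3_r1)


if __name__ == "__main__":
    with patch(f"{__name__}.select_moe_state",
               side_effect=forced_16_card_state):
        # Callers that look the function up through the module now hit the stub.
        assert select_moe_state(2, False, False) == "large-ep"
```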
54 changes: 53 additions & 1 deletion tests/ut/models/test_qwen3_moe.py
@@ -12,11 +12,15 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import math
import unittest

import pytest
import torch
from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM

from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
from vllm_ascend.models.qwen3_moe import (CustomQwen3MoeAttention,
CustomQwen3MoeForCausalLM)


class TestCustomQwen3MoeForCausalLM:
@@ -44,3 +48,51 @@ def test_packed_modules_mapping_structure(self):
]
}
assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping


class DummyRMSNorm:

def __init__(self, dim: int, eps: float = 1e-6):
self.dim = dim
self.eps = eps

def __call__(self, x):
mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
denom = (mean_sq + self.eps).sqrt()
return x / denom


class TestCustomQwen3MoeAttention(unittest.TestCase):

def setUp(self):
self.batch = 2
self.seq_len = 3
self.q_size = 8
self.kv_size = 8
self.head_dim = 4
self.rms_eps = 1e-6

total_dim = self.q_size + 2 * self.kv_size

self.qkv = torch.arange(self.batch * self.seq_len * total_dim,
dtype=torch.float32).reshape(
self.batch, self.seq_len, total_dim)

def test_constant_input_normalization(self):
ones_qkv = torch.ones((1, 1, self.q_size + 2 * self.kv_size),
dtype=torch.float32)

q_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
k_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
q, k, v = CustomQwen3MoeAttention.normalize_qkv(
ones_qkv, self.q_size, self.kv_size, self.head_dim, q_norm, k_norm)

norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps)

expected_q = torch.full((1, 1, self.q_size), norm_val)
expected_k = torch.full((1, 1, self.kv_size), norm_val)
expected_v = torch.ones((1, 1, self.kv_size), dtype=torch.float32)

self.assertTrue(torch.allclose(q, expected_q, atol=1e-6))
self.assertTrue(torch.allclose(k, expected_k, atol=1e-6))
self.assertTrue(torch.equal(v, expected_v))
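
The expected value in the constant-input test follows directly from the weightless RMS normalization in `DummyRMSNorm`: for an all-ones input the mean of squares is exactly 1, so every element is divided by sqrt(1 + eps). A minimal sketch of that arithmetic:

```python
# Why norm_val == 1 / sqrt(1 + eps) for an all-ones input: mean(x**2) == 1,
# so the RMS denominator is sqrt(1 + eps).
import math

import torch

eps = 1e-6
x = torch.ones(1, 1, 4)
denom = (x.pow(2).mean(dim=-1, keepdim=True) + eps).sqrt()  # sqrt(1 + eps)
expected = torch.full_like(x, 1.0 / math.sqrt(1.0 + eps))
assert torch.allclose(x / denom, expected, atol=1e-6)
```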
119 changes: 118 additions & 1 deletion tests/ut/ops/test_rotary_embedding.py
@@ -1,12 +1,19 @@
import math
from unittest import mock
from unittest.mock import MagicMock, patch

import pytest
import torch
import torch_npu

from tests.ut.base import TestBase
from vllm_ascend.ops.rotary_embedding import __set_cos_sin_cache # noqa E402
from vllm_ascend.ops.rotary_embedding import \
__set_cos_sin_cache as raw__set_cos_sin_cache
from vllm_ascend.ops.rotary_embedding import (custom_rotary_embedding_enabled,
native_rope_deepseek_forward,
rope_forward_oot, rotate_half,
rope_forward, rope_forward_oot,
rotate_half,
yarn_find_correction_dim,
yarn_get_mscale)

@@ -312,3 +319,113 @@ def test_scale_greater_than_1(self):
expected,
places=6,
msg=f"Failed for scale={scale}, mscale={mscale}")


class MockRotaryEmbedding:

def __init__(self, base, rotary_dim, max_position_embeddings):
self.base = base
self.rotary_dim = rotary_dim
self.max_position_embeddings = max_position_embeddings


@pytest.fixture
def dummy_module():
return MockRotaryEmbedding(base=10000.0,
rotary_dim=64,
max_position_embeddings=512)


class TestSetCosSinCache:

def test_set_cos_sin_cache_generates_real_tensors(self, dummy_module):
calls = []

def fake_register_buffer(name, tensor, persistent=True):
setattr(dummy_module, name, tensor)
calls.append(name)

dummy_module.register_buffer = fake_register_buffer
seq_len = 128
device = torch.device("cpu")
dtype = torch.float32

raw__set_cos_sin_cache(dummy_module, seq_len, device, dtype)

assert calls == ['inv_freq', 'cos', 'sin']

assert isinstance(dummy_module.inv_freq, torch.Tensor)
assert dummy_module.inv_freq.shape == (dummy_module.rotary_dim // 2, )
assert dummy_module.inv_freq.device == device
assert dummy_module.inv_freq.dtype == torch.float32

expected_shape = (dummy_module.max_position_embeddings,
dummy_module.rotary_dim)
for name in ('cos', 'sin'):
buf = getattr(dummy_module, name)
assert isinstance(buf, torch.Tensor)
assert buf.shape == expected_shape
assert buf.device == device
assert buf.dtype == torch.float32
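
The shapes asserted here match the standard RoPE cos/sin cache layout: `inv_freq` holds one inverse frequency per rotated dimension pair, and `cos`/`sin` hold one row per position. A minimal sketch under that assumption (the actual `__set_cos_sin_cache` body may differ in details):

```python
# Standard RoPE cache construction matching the asserted shapes; an
# illustrative assumption, not the actual __set_cos_sin_cache implementation.
import torch

base, rotary_dim, max_pos = 10000.0, 64, 512
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float32) /
                         rotary_dim))  # shape: (rotary_dim // 2,)
t = torch.arange(max_pos, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)  # (max_pos, rotary_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)  # (max_pos, rotary_dim)
cos, sin = emb.cos(), emb.sin()
assert inv_freq.shape == (rotary_dim // 2, )
assert cos.shape == (max_pos, rotary_dim) and sin.shape == (max_pos, rotary_dim)
```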


class DummyConfig:

class TorchairGraphConfig:
enabled = True

torchair_graph_config = TorchairGraphConfig()


class DummyModel:

def __init__(self, head_size, max_pos):
self.head_size = head_size
self.max_position_embeddings = max_pos
self.cos = torch.randn(max_pos, head_size)
self.sin = torch.randn(max_pos, head_size)

def embed(self, positions, weight):
B, S = positions.shape
return torch.ones(B, S, self.head_size) * 0.5


@mock.patch("vllm_ascend.ops.rotary_embedding.get_ascend_config",
return_value=DummyConfig())
@mock.patch.object(torch_npu, "npu_apply_rotary_pos_emb")
@mock.patch("vllm_ascend.ops.rotary_embedding.__set_cos_sin_cache")
def test_rope_forward_output_shape(mock_set_cache, mock_npu_apply,
mock_get_ascend_config):
batch_size = 2
seq_len = 4
num_heads = 3
head_size = 5

q = torch.randn(batch_size, seq_len, num_heads * head_size)
k = torch.randn_like(q)

positions = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1)

model = DummyModel(head_size=head_size, max_pos=100)

def fake_apply_rotary(q_in, k_in, cos, sin):
return q_in, k_in

mock_npu_apply.side_effect = fake_apply_rotary

q_out, k_out = rope_forward(
model,
positions=positions,
query=q,
key=k,
offsets=None,
is_neox_style_override=None,
max_seq_len=None,
is_prefill=False,  # skip the rope_forward_oot fallback path
is_qwen_torchair=True,  # take the rotary path
)

assert q_out.shape == (batch_size, 1, seq_len, num_heads * head_size)
assert k_out.shape == (batch_size, 1, seq_len, num_heads * head_size)

mock_set_cache.assert_not_called()
2 changes: 1 addition & 1 deletion tests/ut/test_ascend_config.py
@@ -232,7 +232,7 @@ def test_check_ascend_config_wrong_case(self):

def test_check_torchair_supported(self):
test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
('qwen', False), ('llama', False)]
('qwen', True), ('llama', False)]
for model_type, expected_output in test_cases:
self.assertEqual(_check_torchair_supported(model_type),
expected_output)
4 changes: 2 additions & 2 deletions vllm_ascend/ascend_config.py
@@ -17,7 +17,7 @@

from vllm.logger import logger

TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2"]
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"]


def _check_torchair_supported(model_type: str):
@@ -159,7 +159,7 @@ def check_ascend_config(vllm_config, enforce_eager):
else:
# torchair_graph case
if ascend_config.torchair_graph_config.enabled:
# torchair_graph is supported for deepseek/pangu model only.
# torchair_graph is supported for deepseek/pangu/qwen model only.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if not _check_torchair_supported(model_type):
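
With "qwen" added to TORCHAIR_MODEL_LIST, the flipped ('qwen', True) case in tests/ut/test_ascend_config.py above is consistent with a case-insensitive substring check against that list. The body of `_check_torchair_supported` is collapsed in this diff, so the following is only a hypothetical sketch of such a check, not the actual implementation:

```python
# Hypothetical sketch; the real _check_torchair_supported body is collapsed
# in the diff above.
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"]


def _check_torchair_supported(model_type: str) -> bool:
    # Case-insensitive substring match, so "PanguProMoE" and "deepseek_v3"
    # pass while "llama" does not.
    return any(name in model_type.lower() for name in TORCHAIR_MODEL_LIST)
```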
5 changes: 3 additions & 2 deletions vllm_ascend/attention/attention_v1_torchair.py
@@ -378,8 +378,9 @@ def forward(
shape = [batch_size * seq_len, num_heads, head_size]
"""
num_tokens = query.shape[0]
use_kv_cache_quant = kv_cache is not None and kv_cache[0].numel(
) > 0 and kv_cache[0].dtype == torch.int8
use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0
and kv_cache[0].numel() > 0
and kv_cache[0].dtype == torch.int8)
if output is None:
output = torch.empty(num_tokens,
self.num_heads,
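
The rewritten guard in attention_v1_torchair.py presumably handles the case where kv_cache is an empty tuple or list: such a value is not None, so the old expression would index kv_cache[0] and raise an IndexError before reaching the dtype check. A minimal sketch of how the added length check short-circuits that case:

```python
# An empty kv_cache is not None, so the new len() guard must short-circuit
# before kv_cache[0] is evaluated.
import torch

kv_cache: tuple = ()
use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0
                      and kv_cache[0].numel() > 0
                      and kv_cache[0].dtype == torch.int8)
assert use_kv_cache_quant is False
```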
3 changes: 3 additions & 0 deletions vllm_ascend/models/__init__.py
@@ -59,3 +59,6 @@ def register_model():
ModelRegistry.register_model(
"PanguProMoEForCausalLM",
"vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")

ModelRegistry.register_model(
"Qwen2ForCausalLM", "vllm_ascend.models.qwen2:CustomQwen2ForCausalLM")