Torchair graph812 cov #2337

Open: wants to merge 12 commits into main (showing changes from all commits).
66 changes: 66 additions & 0 deletions tests/e2e/multicard/test_torchair_graph_mode.py
@@ -21,8 +21,10 @@
"""
import os
from typing import Dict
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm_ascend.ascend_forward_context import _get_fused_moe_state

os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

@@ -162,3 +164,67 @@ def test_e2e_pangu_with_torchair():
},
}
_pangu_torchair_test_fixture(additional_config)


def _qwen_torchair_test_fixture(
model,
tp,
enable_expert_parallel,
):
# The current access control does not support 16 cards,
# so the MC2 operator in Qwen's graph mode cannot run.
# Once 16-card support is available,
# this e2e can be switched to graph mode.
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

additional_config = {
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
},
"refresh": True,
}

with VllmRunner(
model,
dtype="half",
tensor_parallel_size=tp,
distributed_executor_backend="mp",
enforce_eager=True,
additional_config=additional_config,
enable_expert_parallel=enable_expert_parallel,
) as vllm_model:
# use greedy sampling to make sure the generated results are deterministic
vllm_output = vllm_model.generate_greedy(example_prompts, 5)

# NOTE: vllm-ascend/pangu-pro-moe-pruing is only a part of PanguProMoE
# with 2 hidden layers, so the golden results may seem inaccurate.
# This will only change if accuracy changes with the official weights
# of PanguProMoE.
golden_results = [
'Hello, my name is Remempondeprecatedmiot忱',
'The president of the United States is Remem下的一个 rever ceremoni Segnali',
'The capital of France is Rememvoud administrativ Remem投',
'The future of AI isotope Segnali Zoeken精细化 supus',
]

assert len(golden_results) == len(vllm_output)
for i in range(len(vllm_output)):
print(f"Generated text: {vllm_output[i][1]!r}")


def test_e2e_qwen3_moe_with_torchair():

def stubbed_get_state(ep_size, with_prefill, is_deepseek_v3_r1):
return _get_fused_moe_state(16, with_prefill, is_deepseek_v3_r1)

with patch('vllm_ascend.ascend_forward_context._get_fused_moe_state',
side_effect=stubbed_get_state):
_qwen_torchair_test_fixture("Qwen/Qwen3-30B-A3B", 2, True)
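
The test above patches `_get_fused_moe_state` with a `side_effect` stub so the fused-MoE state is computed as if the expert-parallel size were 16, while the run itself only uses 2 cards. Below is a minimal, self-contained sketch of the same pattern; `select_moe_state` and its return values are hypothetical stand-ins, not the real vllm_ascend function:

```python
# Sketch of the side_effect patching pattern used above. `select_moe_state`
# is a hypothetical stand-in for
# vllm_ascend.ascend_forward_context._get_fused_moe_state.
from unittest.mock import patch


def select_moe_state(ep_size: int, with_prefill: bool,
                     is_deepseek_v3_r1: bool) -> str:
    return "large-ep" if ep_size >= 16 else "small-ep"


# Keep a direct reference to the original so the stub can delegate to it.
_original_select_moe_state = select_moe_state


def forced_16_card_state(ep_size: int, with_prefill: bool,
                         is_deepseek_v3_r1: bool) -> str:
    # Ignore the runtime ep_size and pretend 16 cards are available.
    return _original_select_moe_state(16, with_prefill, is_deepseek_v3_r1)


if __name__ == "__main__":
    with patch(f"{__name__}.select_moe_state",
               side_effect=forced_16_card_state):
        # Callers that look the function up through the module now hit the stub.
        assert select_moe_state(2, False, False) == "large-ep"
```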
54 changes: 53 additions & 1 deletion tests/ut/models/test_qwen3_moe.py
@@ -12,11 +12,15 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import math
import unittest

import pytest
import torch
from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM

from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM
from vllm_ascend.models.qwen3_moe import (CustomQwen3MoeAttention,
CustomQwen3MoeForCausalLM)


class TestCustomQwen3MoeForCausalLM:
@@ -44,3 +48,51 @@ def test_packed_modules_mapping_structure(self):
]
}
assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping


class DummyRMSNorm:

def __init__(self, dim: int, eps: float = 1e-6):
self.dim = dim
self.eps = eps

def __call__(self, x):
mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
denom = (mean_sq + self.eps).sqrt()
return x / denom


class TestCustomQwen3MoeAttention(unittest.TestCase):

def setUp(self):
self.batch = 2
self.seq_len = 3
self.q_size = 8
self.kv_size = 8
self.head_dim = 4
self.rms_eps = 1e-6

total_dim = self.q_size + 2 * self.kv_size

self.qkv = torch.arange(self.batch * self.seq_len * total_dim,
dtype=torch.float32).reshape(
self.batch, self.seq_len, total_dim)

def test_constant_input_normalization(self):
ones_qkv = torch.ones((1, 1, self.q_size + 2 * self.kv_size),
dtype=torch.float32)

q_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
k_norm = DummyRMSNorm(self.head_dim, self.rms_eps)
q, k, v = CustomQwen3MoeAttention.normalize_qkv(
ones_qkv, self.q_size, self.kv_size, self.head_dim, q_norm, k_norm)

norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps)

expected_q = torch.full((1, 1, self.q_size), norm_val)
expected_k = torch.full((1, 1, self.kv_size), norm_val)
expected_v = torch.ones((1, 1, self.kv_size), dtype=torch.float32)

self.assertTrue(torch.allclose(q, expected_q, atol=1e-6))
self.assertTrue(torch.allclose(k, expected_k, atol=1e-6))
self.assertTrue(torch.equal(v, expected_v))
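
The expected value in the constant-input test follows directly from the weightless RMS normalization in `DummyRMSNorm`: for an all-ones input the mean of squares is exactly 1, so every element is divided by sqrt(1 + eps). A minimal sketch of that arithmetic:

```python
# Why norm_val == 1 / sqrt(1 + eps) for an all-ones input: mean(x**2) == 1,
# so the RMS denominator is sqrt(1 + eps).
import math

import torch

eps = 1e-6
x = torch.ones(1, 1, 4)
denom = (x.pow(2).mean(dim=-1, keepdim=True) + eps).sqrt()  # sqrt(1 + eps)
expected = torch.full_like(x, 1.0 / math.sqrt(1.0 + eps))
assert torch.allclose(x / denom, expected, atol=1e-6)
```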
119 changes: 118 additions & 1 deletion tests/ut/ops/test_rotary_embedding.py
@@ -1,12 +1,19 @@
import math
from unittest import mock
from unittest.mock import MagicMock, patch

import pytest
import torch
import torch_npu

from tests.ut.base import TestBase
from vllm_ascend.ops.rotary_embedding import __set_cos_sin_cache # noqa E402
from vllm_ascend.ops.rotary_embedding import \
__set_cos_sin_cache as raw__set_cos_sin_cache
from vllm_ascend.ops.rotary_embedding import (custom_rotary_embedding_enabled,
native_rope_deepseek_forward,
rope_forward_oot, rotate_half,
rope_forward, rope_forward_oot,
rotate_half,
yarn_find_correction_dim,
yarn_get_mscale)

@@ -312,3 +319,113 @@ def test_scale_greater_than_1(self):
expected,
places=6,
msg=f"Failed for scale={scale}, mscale={mscale}")


class MockRotaryEmbedding:

def __init__(self, base, rotary_dim, max_position_embeddings):
self.base = base
self.rotary_dim = rotary_dim
self.max_position_embeddings = max_position_embeddings


@pytest.fixture
def dummy_module():
return MockRotaryEmbedding(base=10000.0,
rotary_dim=64,
max_position_embeddings=512)


class TestSetCosSinCache:

def test_set_cos_sin_cache_generates_real_tensors(self, dummy_module):
calls = []

def fake_register_buffer(name, tensor, persistent=True):
setattr(dummy_module, name, tensor)
calls.append(name)

dummy_module.register_buffer = fake_register_buffer
seq_len = 128
device = torch.device("cpu")
dtype = torch.float32

raw__set_cos_sin_cache(dummy_module, seq_len, device, dtype)

assert calls == ['inv_freq', 'cos', 'sin']

assert isinstance(dummy_module.inv_freq, torch.Tensor)
assert dummy_module.inv_freq.shape == (dummy_module.rotary_dim // 2, )
assert dummy_module.inv_freq.device == device
assert dummy_module.inv_freq.dtype == torch.float32

expected_shape = (dummy_module.max_position_embeddings,
dummy_module.rotary_dim)
for name in ('cos', 'sin'):
buf = getattr(dummy_module, name)
assert isinstance(buf, torch.Tensor)
assert buf.shape == expected_shape
assert buf.device == device
assert buf.dtype == torch.float32
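
The shapes asserted here match the standard RoPE cos/sin cache layout: `inv_freq` holds one inverse frequency per rotated dimension pair, and `cos`/`sin` hold one row per position. A minimal sketch under that assumption (the actual `__set_cos_sin_cache` body may differ in details):

```python
# Standard RoPE cache construction matching the asserted shapes; an
# illustrative assumption, not the actual __set_cos_sin_cache implementation.
import torch

base, rotary_dim, max_pos = 10000.0, 64, 512
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float32) /
                         rotary_dim))  # shape: (rotary_dim // 2,)
t = torch.arange(max_pos, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)  # (max_pos, rotary_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)  # (max_pos, rotary_dim)
cos, sin = emb.cos(), emb.sin()
assert inv_freq.shape == (rotary_dim // 2, )
assert cos.shape == (max_pos, rotary_dim) and sin.shape == (max_pos, rotary_dim)
```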


class DummyConfig:

class TorchairGraphConfig:
enabled = True

torchair_graph_config = TorchairGraphConfig()


class DummyModel:

def __init__(self, head_size, max_pos):
self.head_size = head_size
self.max_position_embeddings = max_pos
self.cos = torch.randn(max_pos, head_size)
self.sin = torch.randn(max_pos, head_size)

def embed(self, positions, weight):
B, S = positions.shape
return torch.ones(B, S, self.head_size) * 0.5


@mock.patch("vllm_ascend.ops.rotary_embedding.get_ascend_config",
return_value=DummyConfig())
@mock.patch.object(torch_npu, "npu_apply_rotary_pos_emb")
@mock.patch("vllm_ascend.ops.rotary_embedding.__set_cos_sin_cache")
def test_rope_forward_output_shape(mock_set_cache, mock_npu_apply,
mock_get_ascend_config):
batch_size = 2
seq_len = 4
num_heads = 3
head_size = 5

q = torch.randn(batch_size, seq_len, num_heads * head_size)
k = torch.randn_like(q)

positions = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1)

model = DummyModel(head_size=head_size, max_pos=100)

def fake_apply_rotary(q_in, k_in, cos, sin):
return q_in, k_in

mock_npu_apply.side_effect = fake_apply_rotary

q_out, k_out = rope_forward(
model,
positions=positions,
query=q,
key=k,
offsets=None,
is_neox_style_override=None,
max_seq_len=None,
is_prefill=False,  # skip the rope_forward_oot fallback path
is_qwen_torchair=True,  # take the rotary path
)

assert q_out.shape == (batch_size, 1, seq_len, num_heads * head_size)
assert k_out.shape == (batch_size, 1, seq_len, num_heads * head_size)

mock_set_cache.assert_not_called()
2 changes: 1 addition & 1 deletion tests/ut/test_ascend_config.py
@@ -232,7 +232,7 @@ def test_check_ascend_config_wrong_case(self):

def test_check_torchair_supported(self):
test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
('qwen', False), ('llama', False)]
('qwen', True), ('llama', False)]
for model_type, expected_output in test_cases:
self.assertEqual(_check_torchair_supported(model_type),
expected_output)
4 changes: 2 additions & 2 deletions vllm_ascend/ascend_config.py
@@ -17,7 +17,7 @@

from vllm.logger import logger

TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2"]
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"]


def _check_torchair_supported(model_type: str):
@@ -159,7 +159,7 @@ def check_ascend_config(vllm_config, enforce_eager):
else:
# torchair_graph case
if ascend_config.torchair_graph_config.enabled:
# torchair_graph is supported for deepseek/pangu model only.
# torchair_graph is supported for deepseek/pangu/qwen model only.
if vllm_config.model_config:
model_type = vllm_config.model_config.hf_config.model_type
if not _check_torchair_supported(model_type):
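
With "qwen" added to TORCHAIR_MODEL_LIST, the flipped ('qwen', True) case in tests/ut/test_ascend_config.py above is consistent with a case-insensitive substring check against that list. The body of `_check_torchair_supported` is collapsed in this diff, so the following is only a hypothetical sketch of such a check, not the actual implementation:

```python
# Hypothetical sketch; the real _check_torchair_supported body is collapsed
# in the diff above.
TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"]


def _check_torchair_supported(model_type: str) -> bool:
    # Case-insensitive substring match, so "PanguProMoE" and "deepseek_v3"
    # pass while "llama" does not.
    return any(name in model_type.lower() for name in TORCHAIR_MODEL_LIST)
```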
5 changes: 3 additions & 2 deletions vllm_ascend/attention/attention_v1_torchair.py
@@ -378,8 +378,9 @@ def forward(
shape = [batch_size * seq_len, num_heads, head_size]
"""
num_tokens = query.shape[0]
use_kv_cache_quant = kv_cache is not None and kv_cache[0].numel(
) > 0 and kv_cache[0].dtype == torch.int8
use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0
and kv_cache[0].numel() > 0
and kv_cache[0].dtype == torch.int8)
if output is None:
output = torch.empty(num_tokens,
self.num_heads,
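
The rewritten guard in attention_v1_torchair.py presumably handles the case where kv_cache is an empty tuple or list: such a value is not None, so the old expression would index kv_cache[0] and raise an IndexError before reaching the dtype check. A minimal sketch of how the added length check short-circuits that case:

```python
# An empty kv_cache is not None, so the new len() guard must short-circuit
# before kv_cache[0] is evaluated.
import torch

kv_cache: tuple = ()
use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0
                      and kv_cache[0].numel() > 0
                      and kv_cache[0].dtype == torch.int8)
assert use_kv_cache_quant is False
```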
3 changes: 3 additions & 0 deletions vllm_ascend/models/__init__.py
@@ -59,3 +59,6 @@ def register_model():
ModelRegistry.register_model(
"PanguProMoEForCausalLM",
"vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")

ModelRegistry.register_model(
"Qwen2ForCausalLM", "vllm_ascend.models.qwen2:CustomQwen2ForCausalLM")