
Commit a3970e8

[0.9.1][Bugfix][Aclgraph] Fix qwen3-moe + aclgraph + tp (#2647)
### What this PR does / why we need it?
Qwen3 MoE + aclgraph only supports the pure TP scenario, because in the v0.9.1-dev branch aclgraph only supports allgather. This PR switches to `AscendSparseMoeBlock` so that aclgraph works with TP.

### Does this PR introduce _any_ user-facing change?
When aclgraph is enabled, users can run Qwen3 MoE only with pure TP.

### How was this patch tested?
CI passed with the newly added test.

---------

Signed-off-by: MengqingCao <[email protected]>
1 parent 234a5a4 commit a3970e8
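For reference, a minimal usage sketch of the user-facing behavior described above, assuming vLLM's standard offline `LLM` entry point; the model name, dtype, and parallel size mirror the test added in this commit and are illustrative, not part of the change itself:

```python
# Minimal sketch (not part of this commit): run Qwen3 MoE with pure TP and
# graph capture (aclgraph) enabled, i.e. without forcing eager mode.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-30B-A3B",   # illustrative; any Qwen3 MoE checkpoint
    tensor_parallel_size=4,       # pure TP is the only supported MoE layout here
    enforce_eager=False,          # keep aclgraph / graph capture enabled
    dtype="bfloat16",
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=5))
print(outputs[0].outputs[0].text)
```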

File tree

tests/multicard/test_qwen3_moe.py
vllm_ascend/models/qwen3_moe.py

2 files changed: +8, -17 lines


tests/multicard/test_qwen3_moe.py

Lines changed: 3 additions & 2 deletions

@@ -24,17 +24,18 @@
 from tests.conftest import VllmRunner


-def test_models_distributed_Qwen3_MOE_TP2():
+def test_models_distributed_Qwen3_MOE_Aclgraph_TP2():
     example_prompts = [
         "Hello, my name is",
     ]
-    dtype = "half"
+    dtype = "bfloat16"
     max_tokens = 5
     with VllmRunner(
             "Qwen/Qwen3-30B-A3B",
             dtype=dtype,
             tensor_parallel_size=4,
             distributed_executor_backend="mp",
+            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

vllm_ascend/models/qwen3_moe.py

Lines changed: 5 additions & 15 deletions

@@ -22,7 +22,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, CompilationLevel, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -32,8 +32,7 @@
 from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention,
                                                   Qwen3MoeForCausalLM,
-                                                  Qwen3MoeMLP, Qwen3MoeModel,
-                                                  Qwen3MoeSparseMoeBlock)
+                                                  Qwen3MoeMLP, Qwen3MoeModel)
 from vllm.model_executor.models.utils import (
     extract_layer_index, make_empty_intermediate_tensors_factory, make_layers,
     maybe_prefix)
@@ -79,21 +78,12 @@ def __init__(
         layer_idx = extract_layer_index(prefix)
         mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
                            config.mlp_only_layers)
-        use_aclgraph = (vllm_config is not None
-                        and vllm_config.compilation_config.level
-                        == CompilationLevel.PIECEWISE
-                        and not vllm_config.model_config.enforce_eager)
         if (layer_idx not in mlp_only_layers) and (
                 config.num_experts > 0 and
                 (layer_idx + 1) % config.decoder_sparse_step == 0):
-            if not use_aclgraph:
-                self.mlp = AscendSparseMoeBlock(config=config,
-                                                quant_config=quant_config,
-                                                prefix=f"{prefix}.mlp")
-            else:
-                self.mlp = Qwen3MoeSparseMoeBlock(config=config,
-                                                  quant_config=quant_config,
-                                                  prefix=f"{prefix}.mlp")
+            self.mlp = AscendSparseMoeBlock(config=config,
+                                            quant_config=quant_config,
+                                            prefix=f"{prefix}.mlp")
         else:
             self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                    intermediate_size=config.intermediate_size,
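As a side note, the `if` predicate kept by this change decides which decoder layers get a sparse MoE block at all. A small stand-alone illustration of that predicate follows; the default values used below (`decoder_sparse_step=1`, empty `mlp_only_layers`, 128 experts) are assumptions about the Qwen3 MoE config, not taken from this diff:

```python
# Toy illustration (not from the PR): which decoder layers receive a sparse MoE
# block vs. a dense MLP, following the predicate shown in the diff above.
def uses_moe(layer_idx: int, num_experts: int, decoder_sparse_step: int,
             mlp_only_layers: list[int]) -> bool:
    return (layer_idx not in mlp_only_layers
            and num_experts > 0
            and (layer_idx + 1) % decoder_sparse_step == 0)

# Assuming typical Qwen3 MoE defaults, every layer would use
# AscendSparseMoeBlock after this change:
print([uses_moe(i, num_experts=128, decoder_sparse_step=1, mlp_only_layers=[])
       for i in range(4)])  # -> [True, True, True, True]
```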

0 commit comments
