
Commit 050d202

[Quickfix] Fix dp+ep+tp error when sp chunked the hidden_states (#3246)
### What this PR does / why we need it?

Fix the dp+ep+tp in-place copy error that occurs when sequence parallelism chunks the `hidden_states`.

### How was this patch tested?

Tested locally with the following script:

```bash
python examples/offline_data_parallel.py \
    --model="Qwen/Qwen3-30B-A3B" \
    --dp-size=2 \
    --tp-size=2 \
    --enable-expert-parallel
```

Signed-off-by: MengqingCao <[email protected]>
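The title refers to an in-place copy that breaks once sequence parallelism (SP) chunks `hidden_states`. Below is a minimal sketch of that general failure mode, assuming nothing beyond plain PyTorch; the tensor names and sizes are illustrative and not taken from the repository:

```python
# Minimal, standalone sketch (not code from this commit) of the failure mode:
# with sequence parallelism each rank only holds a chunk of hidden_states, so an
# in-place copy into a buffer sized for the full token count no longer lines up.
import torch

num_tokens, hidden_size, sp_size = 8, 16, 2
full_buffer = torch.empty(num_tokens, hidden_size)           # sized for all tokens
sp_chunk = torch.randn(num_tokens // sp_size, hidden_size)   # this rank's SP chunk

try:
    full_buffer.copy_(sp_chunk)  # shape mismatch -> RuntimeError
except RuntimeError as err:
    print("inplace copy error:", err)

# Copying into the slice that corresponds to the chunk avoids the mismatch.
full_buffer[: sp_chunk.shape[0]].copy_(sp_chunk)
```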
1 parent cf445c4 commit 050d202

File tree: 2 files changed, +4 -0 lines

vllm_ascend/ops/fused_moe.py

Lines changed: 1 addition & 0 deletions
@@ -295,6 +295,7 @@ def __init__(
             in_dtype=params_dtype,
         )
         self.moe_config = moe
+        # TODO: The self.moe_config.tp_size here is not correct, fixme soon

         if quant_config is None:
             self.quant_method = AscendUnquantizedFusedMoEMethod(moe)

vllm_ascend/platform.py

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,7 @@
 #

 import gc
+import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple

@@ -260,6 +261,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             compilation_config.level = CompilationLevel.NO_COMPILATION

         if parallel_config and parallel_config.worker_cls == "auto":
+            # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
+            os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
             if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
                 parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
             else:
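The `check_and_update_config` change above pins the all2all backend from inside the plugin. For reference, the same workaround could be applied by hand before running the reproduction command from the description; this is a sketch assuming only that the environment variable must be set before vLLM initializes the platform (the subprocess wrapper is illustrative, not part of the commit):

```python
# Sketch: apply the platform.py workaround manually before engine startup,
# then run the reproduction command from the PR description.
import os
import subprocess

# Must be set before vLLM's platform config check runs.
os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"

subprocess.run(
    [
        "python", "examples/offline_data_parallel.py",
        "--model=Qwen/Qwen3-30B-A3B",
        "--dp-size=2",
        "--tp-size=2",
        "--enable-expert-parallel",
    ],
    check=True,
    env=os.environ,  # propagate the backend override to the child process
)
```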
