
Commit 128c120

[0.9.1][bugfix] Address abnormal VRAM increase in quantized models with floating-point MTP (#2554)
### **Problem & Cause**

VRAM usage increased abnormally during mixed-precision inference with quantized models and a floating-point MTP layer. The cause was `dist.all_to_all_single` creating extra HCCL communicators, each of which allocated unnecessary buffers that consumed additional device memory.

### **Solution**

This commit passes the `group` parameter to `dist.all_to_all_single`. By reusing the existing communicator from the `vllm-ascend` framework, all communication operations share a unified communication domain, which prevents the creation of extra buffers and resolves the VRAM issue.

### **Collaborators**

@kunpengW-code

cc @farawayboat @MengqingCao

Signed-off-by: SlightwindSec <[email protected]>
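For reference, a minimal sketch of the pattern this fix applies (the tensor name and the `ep_group` handle here are illustrative, not taken from the patch): passing `group=` makes `torch.distributed.all_to_all_single` run on an existing communicator instead of the default process group, which the commit identifies as the source of the extra HCCL communicators and buffers.

```python
# Sketch only: exchange_token_counts, local_counts and ep_group are
# hypothetical names chosen for illustration.
import torch
import torch.distributed as dist

def exchange_token_counts(local_counts: torch.Tensor,
                          ep_group: dist.ProcessGroup) -> torch.Tensor:
    # One slot per peer rank for the counts we will receive.
    recv_counts = local_counts.new_empty(local_counts.shape[0])
    # group= routes the collective through the caller's existing
    # communicator; omitting it falls back to the default group, which
    # this commit identifies as what spawned extra HCCL buffers.
    dist.all_to_all_single(recv_counts, local_counts, group=ep_group)
    return recv_counts
```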
1 parent 60c2df2 commit 128c120

File tree

1 file changed: +18 -7 lines changed

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 18 additions & 7 deletions
```diff
@@ -526,7 +526,9 @@ def fused_experts_with_all2all(hidden_states: torch.Tensor,
 
         gather_sizes = global_expert_tokens.new_empty(
             global_expert_tokens.shape[0])
-        dist.all_to_all_single(gather_sizes, global_expert_tokens)
+        dist.all_to_all_single(gather_sizes,
+                               global_expert_tokens,
+                               group=ep_group.device_group)
 
         token_counts_combined = torch.stack(
             [gather_sizes, global_expert_tokens], dim=0)
@@ -542,10 +544,16 @@ def fused_experts_with_all2all(hidden_states: torch.Tensor,
         gather_size_list = token_counts_combined_cpu[1]
         scatter_size_list = token_counts_combined_cpu[0]
 
-        dist.all_to_all_single(gathered_tokens, quantized_tokens,
-                               scatter_size_list, gather_size_list)
-        dist.all_to_all_single(dynamic_scale, token_scales, scatter_size_list,
-                               gather_size_list)
+        dist.all_to_all_single(gathered_tokens,
+                               quantized_tokens,
+                               scatter_size_list,
+                               gather_size_list,
+                               group=ep_group.device_group)
+        dist.all_to_all_single(dynamic_scale,
+                               token_scales,
+                               scatter_size_list,
+                               gather_size_list,
+                               group=ep_group.device_group)
 
         hidden_states, dynamic_scale, inverse_indices, expert_tokens = torch_npu.npu_moe_re_routing(
             gathered_tokens,
@@ -593,8 +601,11 @@ def fused_experts_with_all2all(hidden_states: torch.Tensor,
             index=inverse_indices.to(torch.float32).argsort().to(torch.int32))
 
         hidden_states = reordered_outputs.new_empty(*quantized_tokens.shape)
-        dist.all_to_all_single(hidden_states, reordered_outputs,
-                               gather_size_list, scatter_size_list)
+        dist.all_to_all_single(hidden_states,
+                               reordered_outputs,
+                               gather_size_list,
+                               scatter_size_list,
+                               group=ep_group.device_group)
 
         final_hidden_states = torch_npu.npu_moe_finalize_routing(
             hidden_states,
```
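As a usage note, the patched calls follow PyTorch's signature `all_to_all_single(output, input, output_split_sizes=None, input_split_sizes=None, group=None, ...)`. A self-contained sketch of the variable-length form used in the second and third hunks, with illustrative names not taken from the patch:

```python
# Sketch only: dispatch_tokens and its parameter names are hypothetical.
import torch
import torch.distributed as dist

def dispatch_tokens(tokens: torch.Tensor,
                    send_sizes: list[int],
                    recv_sizes: list[int],
                    group: dist.ProcessGroup) -> torch.Tensor:
    # Output buffer sized for everything this rank will receive.
    received = tokens.new_empty(sum(recv_sizes), tokens.shape[1])
    # Variable-length exchange; group= keeps the collective on the
    # caller's existing communicator rather than the default group.
    dist.all_to_all_single(received,
                           tokens,
                           output_split_sizes=recv_sizes,
                           input_split_sizes=send_sizes,
                           group=group)
    return received
```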
