Commit 169c4bd

Author: tanqingshan (A) (committed)

Feat-4470
Signed-off-by: tanqingshan (A) <50050625@china.huawei.com>
1 parent 7a21bd7 commit 169c4bd

File tree: 7 files changed, +384 -36 lines

vllm_ascend/ascend_config.py

Lines changed: 2 additions & 1 deletion
@@ -34,6 +34,7 @@ class AscendConfig:
 
    def __init__(self, vllm_config):
        additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+       self.mix_placement = additional_config.get("mix_placement",False)
        torchair_graph_config = additional_config.get("torchair_graph_config",
                                                      {})
 
@@ -368,4 +369,4 @@ def check_ascend_config(vllm_config, enforce_eager):
        logger.warning(
            "ACL Graph is currently experimental. Please "
            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-           " if you encourage any Error")
+           " if you encourage any Error")

vllm_ascend/eplb/adaptor/vllm_adaptor.py

Lines changed: 14 additions & 14 deletions
@@ -221,12 +221,13 @@ def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str):
            json.dump(record, f, indent=4)
 
    def do_update_expert_map(self, layer_id, updated_expert_map):
-       pad_len = self.expert_map_per_layer[layer_id].shape[
-           0] - updated_expert_map.shape[0]
-       updated_expert_map_padded = torch.nn.functional.pad(updated_expert_map,
-                                                           pad=(0, pad_len),
-                                                           mode='constant',
-                                                           value=-1)
+       pad_len = self.expert_map_per_layer[layer_id].shape[0] - updated_expert_map.shape[0]
+       updated_expert_map_padded = torch.nn.functional.pad(
+           updated_expert_map,
+           pad=(0,pad_len),
+           mode='constant',
+           value=-1
+       )
        self.expert_map_per_layer[layer_id].copy_(updated_expert_map_padded)
        self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)
 
@@ -240,15 +241,14 @@ def do_update_expert_weight(self, layer_id, local_expert_to_replace,
 
    def do_update_log2phy_map(self, layer_id, updated_log2phy_map):
        if self.log2phy_map_per_layer[layer_id] is not None:
-           pad_len = self.log2phy_map_per_layer[layer_id].shape[
-               0] - updated_log2phy_map.shape[0]
+           pad_len = self.log2phy_map_per_layer[layer_id].shape[0] - updated_log2phy_map.shape[0]
            updated_log2phy_map_padded = torch.nn.functional.pad(
-               updated_log2phy_map,
-               pad=(0, pad_len),
-               mode='constant',
-               value=-1)
-           self.log2phy_map_per_layer[layer_id].copy_(
-               updated_log2phy_map_padded)
+               updated_log2phy_map,
+               pad=(0,pad_len),
+               mode='constant',
+               value=-1
+           )
+           self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map_padded)
 
    def global2local(self, placement: torch.Tensor,
                     E_local: int) -> torch.Tensor:
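For reference, a minimal standalone sketch of the padding behavior above: a shorter 1-D expert map is right-padded with -1 up to the length of the stored map before being copied in place (values below are made up for illustration):

    import torch

    stored_map = torch.tensor([0, 1, 2, -1, -1, -1])  # length kept by expert_map_per_layer
    updated_map = torch.tensor([2, 0, 1, 3])          # shorter updated map

    pad_len = stored_map.shape[0] - updated_map.shape[0]
    padded = torch.nn.functional.pad(updated_map, pad=(0, pad_len), mode='constant', value=-1)
    stored_map.copy_(padded)
    # stored_map is now tensor([ 2,  0,  1,  3, -1, -1])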

vllm_ascend/ops/fused_moe/experts_selector.py

Lines changed: 16 additions & 1 deletion
@@ -33,6 +33,8 @@ def select_experts(hidden_states: torch.Tensor,
                   routed_scaling_factor=1.0,
                   e_score_correction_bias: Optional[torch.Tensor] = None,
                   indices_type: Optional[torch.dtype] = None,
+                  mix_placement: Optional[bool] = False,
+                  num_logical_experts: int = -1,
                   global_num_experts: int = -1):
    """
    Fused experts with select experts.

@@ -87,6 +89,19 @@ def select_experts(hidden_states: torch.Tensor,
        e_score_correction_bias=e_score_correction_bias,
        global_num_experts=global_num_experts,
    )
+   if mix_placement:
+       pad_shared_expert_ids = torch.full((topk_ids.shape[0], 1),
+                                          num_logical_experts,
+                                          dtype=topk_ids.dtype,
+                                          device=topk_ids.device)
+
+       pad_shared_expert_weights = torch.full((topk_weights.shape[0], 1),
+                                              0.4,
+                                              dtype=topk_weights.dtype,
+                                              device=topk_weights.device)
+       topk_ids = torch.cat([topk_ids, pad_shared_expert_ids], dim=1)
+       topk_weights = torch.cat([topk_weights, pad_shared_expert_weights],
+                                dim=1)
    return topk_weights, topk_ids
 
 
@@ -271,4 +286,4 @@ def _native_select_experts(
    topk_ids = topk_ids.to(torch.int32)
    topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
 
-   return topk_weights, topk_ids
+   return topk_weights, topk_ids
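For context, a minimal sketch of what the new mix_placement branch does in isolation: one extra column is appended to topk_ids pointing at the shared-expert slot (num_logical_experts) with a fixed weight of 0.4 (shapes and values below are illustrative, not from this commit):

    import torch

    topk_ids = torch.tensor([[1, 5], [3, 7]], dtype=torch.int32)  # [num_tokens, top_k]
    topk_weights = torch.tensor([[0.6, 0.4], [0.5, 0.5]])
    num_logical_experts = 8  # assumed id of the shared-expert slot

    pad_ids = torch.full((topk_ids.shape[0], 1), num_logical_experts,
                         dtype=topk_ids.dtype, device=topk_ids.device)
    pad_weights = torch.full((topk_weights.shape[0], 1), 0.4,
                             dtype=topk_weights.dtype, device=topk_weights.device)

    topk_ids = torch.cat([topk_ids, pad_ids], dim=1)              # [[1, 5, 8], [3, 7, 8]]
    topk_weights = torch.cat([topk_weights, pad_weights], dim=1)  # [[0.6, 0.4, 0.4], [0.5, 0.5, 0.4]]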

vllm_ascend/ops/fused_moe/fused_moe.py

Lines changed: 31 additions & 17 deletions
@@ -171,10 +171,10 @@ def __init__(self, *args, **kwargs):
        self.moe_config.dp_group = get_dp_group()
        self.moe_config.ep_group = get_ep_group()
        self.moe_config.mc2_group = get_mc2_group()
-       ascend_config = get_ascend_config()
-       self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
-       self.expert_map_path = ascend_config.expert_map_path
-       self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+       self.ascend_config = get_ascend_config()
+       self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
+       self.expert_map_path = self.ascend_config.expert_map_path
+       self.global_redundant_expert_num = self.ascend_config.init_redundancy_expert
        self.global_num_experts = num_experts + self.global_redundant_expert_num
        if self.custom_routing_function is None and self.e_score_correction_bias is not None:
            vllm_config = get_current_vllm_config()

@@ -194,8 +194,8 @@ def __init__(self, *args, **kwargs):
            self.expert_load_balancer = ExpertLoadBalancer(
                self.expert_map_path, num_experts)
            self.expert_load_balancer.check_expert_map_tensor()
-           self.global_redundant_expert_num = (
-               self.expert_load_balancer.get_global_redundant_expert_num())
+           # self.global_redundant_expert_num = (
+           #     self.expert_load_balancer.get_global_redundant_expert_num())
            self.global_num_experts = num_experts + self.global_redundant_expert_num
            try:
                self.local_num_experts, self.expert_map = (

@@ -253,7 +253,7 @@ def __init__(self, *args, **kwargs):
            moe_quant_params["intermediate_size_full"] = intermediate_size
        self.quant_method.create_weights(layer=self, **moe_quant_params)
 
-       self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+       self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
 
        setup_moe_comm_method(self.moe_config)
        self.quant_type = self._get_quant_type()

@@ -459,8 +459,8 @@ def __init__(
        self._shared_experts = shared_experts
        self.use_overlapped = use_overlapped
        self.shared_expert_stream = None
-       ascend_config = get_ascend_config()
-       self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
+       self.ascend_config = get_ascend_config()
+       self.multistream_overlap_shared_expert = self.ascend_config.multistream_overlap_shared_expert
        if enable_sp():
            logger.info_once(
                "Sequence parallelism is enabled, shared experts are replicated for best performance."

@@ -488,11 +488,19 @@ def forward(
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
-       shared_out, fused_out = AscendFusedMoE.forward(
-           self,
-           hidden_states=hidden_states,
-           router_logits=router_logits,
-       )
+       if self._shared_experts is None:
+           fused_out = AscendFusedMoE.forward(
+               self,
+               hidden_states=hidden_states,
+               router_logits=router_logits,
+           )
+           shared_out = None
+       else:
+           shared_out, fused_out = AscendFusedMoE.forward(
+               self,
+               hidden_states=hidden_states,
+               router_logits=router_logits,
+           )
        return shared_out, fused_out
 
    def forward_impl(self, hidden_states: torch.Tensor,

@@ -506,7 +514,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
            # Use a separate stream to run shared experts.
            # Note that currently we only support calculations in separate streams with aclgraph.
            # Communication operations in another stream might cause unknown errors.
-           shared_out = self._shared_experts(hidden_states)
+           if self._shared_experts is None:
+               shared_out = None
+           else:
+               shared_out = self._shared_experts(hidden_states)
 
        fused_output = AscendFusedMoE.forward_impl(
            self,

@@ -521,6 +532,9 @@ def forward_impl(self, hidden_states: torch.Tensor,
        forward_context = get_forward_context()
        moe_comm_type = forward_context.moe_comm_type
        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2} \
-               and not shared_expert_dp_enabled():
+               and not shared_expert_dp_enabled() and shared_out is not None:
            shared_out = tensor_model_parallel_all_reduce(shared_out)
-       return shared_out, fused_output
+       if shared_out is None:
+           return fused_output
+       else:
+           return shared_out, fused_output
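Note that forward_impl now returns either a bare fused tensor (when no shared experts are configured) or a (shared_out, fused_output) tuple. A small illustrative helper a caller could use to normalize both shapes (not part of this commit):

    import torch

    def unpack_moe_output(out):
        # A bare tensor means the layer has no shared experts.
        if isinstance(out, torch.Tensor):
            return None, out
        shared_out, fused_out = out
        return shared_out, fused_out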

vllm_ascend/ops/fused_moe/moe_mlp.py

Lines changed: 2 additions & 3 deletions
@@ -127,9 +127,8 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
    if quantized_hidden_states is not None:
        dispose_tensor(quantized_hidden_states)
    # act_fn: swiglu
-   group_diff = torch.diff(group_list, dim=0)
-   new_group = torch.cat([group_list[0].unsqueeze(0), group_diff],
-                         dim=0)
+   group_diff = torch.diff(group_list)
+   new_group = torch.cat([group_diff[0].unsqueeze(0), group_diff], dim=0)
    hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
        x=hidden_states,
        weight_scale=w1_scale,
vllm_ascend/patch/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -138,3 +138,4 @@
 # Future Plan:
 # Remove this patch when adapted vllm version contains the above PR.
 #
+from vllm_ascend.patch.worker import patch_deepseekv3
