 from lightllm.distributed import dist_group_manager
 from lightllm.common.fused_moe.topk_select import select_experts
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
+from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import (
     per_token_group_quant_fp8,
     tma_align_input_scale,
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
+from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -40,6 +43,7 @@ def __init__(
     ) -> None:
         super().__init__()
 
+        self.layer_num = layer_num
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
         self.quantized_weight = quant_cfg.quantized_weight
         if self.quant_method is not None:
@@ -60,15 +64,26 @@ def __init__(
 
         global_world_size = get_global_world_size()
         self.global_rank_ = get_global_rank()
+        self.redundancy_expert_num = get_redundancy_expert_num()
+        self.redundancy_expert_ids = get_redundancy_expert_ids(layer_num)
+        logger.info(
+            f"global_rank {self.global_rank_} layer_index {layer_num} redundancy_expert_ids: {self.redundancy_expert_ids}"
+        )
+        self.redundancy_expert_ids_tensor = torch.tensor(self.redundancy_expert_ids, dtype=torch.int64, device="cuda")
+        self.routed_expert_counter_tensor = torch.zeros((self.n_routed_experts,), dtype=torch.int64, device="cuda")
+        self.total_expert_num_contain_redundancy = (
+            self.n_routed_experts + self.redundancy_expert_num * global_world_size
+        )
         assert self.n_routed_experts % global_world_size == 0
         self.ep_n_routed_experts = self.n_routed_experts // global_world_size
-        self.experts_up_projs = [None] * self.ep_n_routed_experts
-        self.experts_gate_projs = [None] * self.ep_n_routed_experts
-        self.experts_up_proj_scales = [None] * self.ep_n_routed_experts
-        self.experts_gate_proj_scales = [None] * self.ep_n_routed_experts
+        ep_load_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
+        self.experts_up_projs = [None] * ep_load_expert_num
+        self.experts_gate_projs = [None] * ep_load_expert_num
+        self.experts_up_proj_scales = [None] * ep_load_expert_num
+        self.experts_gate_proj_scales = [None] * ep_load_expert_num
         self.e_score_correction_bias = None
-        self.w2_list = [None] * self.ep_n_routed_experts
-        self.w2_scale_list = [None] * self.ep_n_routed_experts
+        self.w2_list = [None] * ep_load_expert_num
+        self.w2_scale_list = [None] * ep_load_expert_num
         self.scoring_func = network_config["scoring_func"]
         self.w1 = [None, None]  # weight, weight_scale
         self.w2 = [None, None]  # weight, weight_scale
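For a sense of the expert bookkeeping introduced above, a quick worked example; the numbers are purely illustrative (the real values come from the model config and the redundancy settings):

```python
# Illustrative figures only, e.g. a DeepSeek-V3-style MoE served on 16 EP ranks.
n_routed_experts = 256       # routed experts defined by the checkpoint
global_world_size = 16       # expert-parallel ranks
redundancy_expert_num = 1    # redundant expert copies hosted per rank

ep_n_routed_experts = n_routed_experts // global_world_size        # 16 regular experts per rank
ep_load_expert_num = ep_n_routed_experts + redundancy_expert_num   # 17 expert weights loaded per rank
total_expert_num_contain_redundancy = (                            # 272 expert slots seen by dispatch
    n_routed_experts + redundancy_expert_num * global_world_size
)
```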
@@ -84,6 +99,9 @@ def __init__(
         self.lock = threading.Lock()
         # init buffer
 
+        # auto update redundancy expert vars
+        self.auto_update_redundancy_expert: bool = get_env_start_args().auto_update_redundancy_expert
+
     def experts(
         self,
         input_tensor,
@@ -106,6 +124,17 @@ def experts(
             num_expert_group=num_expert_group,
             scoring_func=self.scoring_func,
         )
+
+        if self.redundancy_expert_num > 0:
+            redundancy_topk_ids_repair(
+                topk_ids=topk_ids,
+                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
+                ep_expert_num=self.ep_n_routed_experts,
+                global_rank=self.global_rank_,
+                expert_counter=self.routed_expert_counter_tensor,
+                enable_counter=self.auto_update_redundancy_expert,
+            )
+
         w1, w1_scale = self.w1
         w2, w2_scale = self.w2
         return fused_experts_impl(
@@ -114,7 +143,7 @@ def experts(
             w2=w2,
             topk_weights=topk_weights,
             topk_idx=topk_ids.to(torch.long),
-            num_experts=self.n_routed_experts,  # number of all experts
+            num_experts=self.total_expert_num_contain_redundancy,  # number of all experts, including redundant copies
             buffer=dist_group_manager.ep_buffer,
             is_prefill=is_prefill,
             use_fp8_w8a8=self.use_fp8_w8a8,
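With redundant copies in play, the expert id space seen by DeepEP dispatch grows from `n_routed_experts` to `total_expert_num_contain_redundancy`, and `redundancy_topk_ids_repair` rewrites `topk_ids` into that enlarged space (optionally accumulating per-expert routing counts in `routed_expert_counter_tensor` when `enable_counter` is set). The kernel itself is not part of this diff; the snippet below is only a hedged sketch of how an id in the enlarged space would map back to a rank and a local weight slot, assuming DeepEP's usual contiguous per-rank partitioning:

```python
# Hedged sketch only: assumes each rank owns (ep_expert_num + redundancy_expert_num)
# consecutive slots in the enlarged id space, with its regular experts first and its
# redundant copies in the trailing slots. The real remapping lives in the Triton
# kernel redundancy_topk_ids_repair.
def locate_expert(expert_id: int, ep_expert_num: int, redundancy_expert_num: int):
    slots_per_rank = ep_expert_num + redundancy_expert_num
    owner_rank = expert_id // slots_per_rank          # rank the token is dispatched to
    local_slot = expert_id % slots_per_rank           # index into that rank's fused w1/w2
    is_redundant_copy = local_slot >= ep_expert_num   # trailing slots hold redundant experts
    return owner_rank, local_slot, is_redundant_copy
```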
@@ -142,13 +171,24 @@ def low_latency_dispatch(
             num_expert_group=self.n_group,
             scoring_func=self.scoring_func,
         )
+
+        if self.redundancy_expert_num > 0:
+            redundancy_topk_ids_repair(
+                topk_ids=topk_idx,
+                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
+                ep_expert_num=self.ep_n_routed_experts,
+                global_rank=self.global_rank_,
+                expert_counter=self.routed_expert_counter_tensor,
+                enable_counter=self.auto_update_redundancy_expert,
+            )
+
         topk_idx = topk_idx.to(torch.long)
         num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
         recv_x, masked_m, handle, event, hook = dist_group_manager.ep_buffer.low_latency_dispatch(
             hidden_states,
             topk_idx,
             num_max_dispatch_tokens_per_rank,
-            self.n_routed_experts,
+            self.total_expert_num_contain_redundancy,
             use_fp8=self.use_fp8_w8a8,
             async_finish=False,
             return_recv_hook=True,
@@ -171,6 +211,15 @@ def select_experts_and_quant_input(
             num_expert_group=self.n_group,
             scoring_func=self.scoring_func,
         )
+        if self.redundancy_expert_num > 0:
+            redundancy_topk_ids_repair(
+                topk_ids=topk_idx,
+                redundancy_expert_ids=self.redundancy_expert_ids_tensor,
+                ep_expert_num=self.ep_n_routed_experts,
+                global_rank=self.global_rank_,
+                expert_counter=self.routed_expert_counter_tensor,
+                enable_counter=self.auto_update_redundancy_expert,
+            )
         M, K = hidden_states.shape
         w1, w1_scale = self.w1
         block_size_k = 0
@@ -190,7 +239,6 @@ def dispatch(
         overlap_event: Optional[Any] = None,
     ):
         buffer = dist_group_manager.ep_buffer
-        num_experts = self.n_routed_experts
         # get_dispatch_layout
         (
             num_tokens_per_rank,
@@ -199,7 +247,11 @@ def dispatch(
             is_token_in_rank,
             previous_event,
         ) = buffer.get_dispatch_layout(
-            topk_idx, num_experts, previous_event=overlap_event, async_finish=True, allocate_on_comm_stream=True
+            topk_idx,
+            self.total_expert_num_contain_redundancy,
+            previous_event=overlap_event,
+            async_finish=True,
+            allocate_on_comm_stream=True,
         )
         recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = buffer.dispatch(
             qinput_tensor,
@@ -342,16 +394,18 @@ def _fuse(self):
             and None not in self.experts_gate_projs
             and None not in self.w2_list
         ):
-            w1_list = []
-            for i_experts in range(self.ep_n_routed_experts):
-                expert_gate_up_proj = torch.cat(
-                    [self.experts_gate_projs[i_experts], self.experts_up_projs[i_experts]], dim=0
-                )
-                expert_gate_up_proj = expert_gate_up_proj
-                w1_list.append(expert_gate_up_proj)
-
-            inter_shape, hidden_size = w1_list[0].shape[0], w1_list[0].shape[1]
-            w1 = torch._utils._flatten_dense_tensors(w1_list).view(len(w1_list), inter_shape, hidden_size)
+            gate_out_dim, gate_in_dim = self.experts_gate_projs[0].shape
+            up_out_dim, up_in_dim = self.experts_up_projs[0].shape
+            assert gate_in_dim == up_in_dim
+            dtype = self.experts_gate_projs[0].dtype
+            total_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
+
+            w1 = torch.empty((total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu")
+
+            for i_experts in range(total_expert_num):
+                w1[i_experts, 0:gate_out_dim, :] = self.experts_gate_projs[i_experts]
+                w1[i_experts, gate_out_dim:, :] = self.experts_up_projs[i_experts]
+
             inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
             w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
             if not self.quantized_weight and self.quant_method is not None:
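The rewritten fusion preallocates one contiguous `w1` and copies each expert's gate and up projections into adjacent row blocks, rather than concatenating per expert and flattening; the per-expert layout is unchanged. A toy check of that layout (shapes made up for the example):

```python
import torch

# Toy shapes for illustration only.
ep_load_expert_num, gate_out_dim, up_out_dim, hidden_size = 3, 4, 4, 8
gate = [torch.randn(gate_out_dim, hidden_size) for _ in range(ep_load_expert_num)]
up = [torch.randn(up_out_dim, hidden_size) for _ in range(ep_load_expert_num)]

w1 = torch.empty((ep_load_expert_num, gate_out_dim + up_out_dim, hidden_size))
for e in range(ep_load_expert_num):
    w1[e, :gate_out_dim, :] = gate[e]  # gate_proj rows first ...
    w1[e, gate_out_dim:, :] = up[e]    # ... up_proj rows after

# Row block order matches the old torch.cat([gate, up], dim=0) per expert.
assert torch.equal(w1[1], torch.cat([gate[1], up[1]], dim=0))
```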
@@ -372,17 +426,20 @@ def _fuse_weight_scale(self):
             and None not in self.experts_gate_proj_scales
             and None not in self.w2_scale_list
         ):
-            w1_scale_list = []
-            for i_experts in range(self.ep_n_routed_experts):
-                expert_gate_up_proj_scale = torch.cat(
-                    [self.experts_gate_proj_scales[i_experts], self.experts_up_proj_scales[i_experts]], dim=0
-                )
-                w1_scale_list.append(expert_gate_up_proj_scale)
-
-            inter_shape, hidden_size = w1_scale_list[0].shape[0], w1_scale_list[0].shape[1]
-            w1_scale = torch._utils._flatten_dense_tensors(w1_scale_list).view(
-                len(w1_scale_list), inter_shape, hidden_size
+            gate_out_dim, gate_in_dim = self.experts_gate_proj_scales[0].shape
+            up_out_dim, up_in_dim = self.experts_up_proj_scales[0].shape
+            assert gate_in_dim == up_in_dim
+            dtype = self.experts_gate_proj_scales[0].dtype
+            total_expert_num = self.ep_n_routed_experts + self.redundancy_expert_num
+
+            w1_scale = torch.empty(
+                (total_expert_num, gate_out_dim + up_out_dim, gate_in_dim), dtype=dtype, device="cpu"
             )
+
+            for i_experts in range(total_expert_num):
+                w1_scale[i_experts, 0:gate_out_dim, :] = self.experts_gate_proj_scales[i_experts]
+                w1_scale[i_experts, gate_out_dim:, :] = self.experts_up_proj_scales[i_experts]
+
             inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
             w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
                 len(self.w2_scale_list), inter_shape, hidden_size
@@ -411,7 +468,20 @@ def load_hf_weights(self, weights):
                 if w2_weight in weights:
                     self.w2_list[i_experts_ep] = weights[w2_weight]
 
-        if self.quant_method is not None:
+        # Load weight parameters for redundant experts.
+        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
+            i_experts = redundant_expert_id
+            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
+            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
+            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
+            if w1_weight in weights:
+                self.experts_gate_projs[n_expert_ep + i] = weights[w1_weight]
+            if w3_weight in weights:
+                self.experts_up_projs[n_expert_ep + i] = weights[w3_weight]
+            if w2_weight in weights:
+                self.w2_list[n_expert_ep + i] = weights[w2_weight]
+
+        if self.quantized_weight:
             self._load_weight_scale(weights)
         self._fuse()
 
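Redundant experts are loaded from the same checkpoint tensors as the originals and simply copied again into the trailing `redundancy_expert_num` slots of the per-rank weight lists, right after the rank's regular experts. A hedged illustration of the key construction with made-up values (the real ones come from `self.weight_prefix`, `self.w1_weight_name`, and so on):

```python
# Hypothetical values for illustration only.
weight_prefix = "model.layers.3.mlp.experts"  # assumed shape of self.weight_prefix
w1_weight_name = "gate_proj"                  # assumed value of self.w1_weight_name
n_expert_ep = 16                              # regular experts hosted by this rank
redundancy_expert_ids = [42]                  # this layer's redundant expert(s) on this rank

for i, expert_id in enumerate(redundancy_expert_ids):
    key = f"{weight_prefix}.{expert_id}.{w1_weight_name}.weight"
    # e.g. "model.layers.3.mlp.experts.42.gate_proj.weight" lands at list index
    # n_expert_ep + i == 16, i.e. the first slot after the 16 regular experts.
    print(key, "->", n_expert_ep + i)
```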
@@ -430,6 +500,19 @@ def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
                 if w2_scale in weights:
                     self.w2_scale_list[i_experts_ep] = weights[w2_scale]
 
+        # Load scale parameters for redundant experts.
+        for i, redundant_expert_id in enumerate(self.redundancy_expert_ids):
+            i_experts = redundant_expert_id
+            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
+            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
+            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
+            if w1_scale in weights:
+                self.experts_gate_proj_scales[n_expert_ep + i] = weights[w1_scale]
+            if w3_scale in weights:
+                self.experts_up_proj_scales[n_expert_ep + i] = weights[w3_scale]
+            if w2_scale in weights:
+                self.w2_scale_list[n_expert_ep + i] = weights[w2_scale]
+
     def _cuda(self, cpu_tensor):
         device_id = get_current_device_id()
         if self.quantized_weight: