
Commit 88ca5a0

add ep fake balance
1 parent 81c5f61 commit 88ca5a0

File tree

lightllm/common/fused_moe/grouped_fused_moe_ep.py
lightllm/common/fused_moe/topk_select.py
lightllm/utils/balance_utils.py

3 files changed: +83 −0 lines changed

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 12 additions & 0 deletions
@@ -142,6 +142,11 @@ def fused_experts_impl(
 
     # scatter
     all_tokens = sum(num_recv_tokens_per_expert_list)  # calc the total padded token count.
+
+    # Important log for debugging load balancing
+    # rank = dist.get_rank()
+    # logger.info(f"prefill, [{rank}], all_tokens = {all_tokens}, num_recv_tokens_per_expert_list: {num_recv_tokens_per_expert_list}")
+
     # gather_out shape [receive_num_tokens, hidden]
     gather_out = torch.empty_like(recv_x[0], device=hidden_states.device, dtype=hidden_states.dtype)
     if all_tokens > 0:

@@ -219,6 +224,13 @@ def fused_experts_impl(
         async_finish=False,
         return_recv_hook=False,
     )
+
+    # Important log for debugging load balancing
+    # When the decode CUDA graph is enabled, we cannot call the logger; running with --profile disables the CUDA graph.
+    # rank = dist.get_rank()
+    # all_tokens = sum(masked_m)
+    # logger.info(f"decode, [{rank}], all_tokens = {all_tokens}, expected_m = {expected_m}, num_recv_tokens_per_expert: {masked_m}")
+
     # deepgemm
     gemm_out_b = masked_group_gemm(recv_x, masked_m, hidden_states.dtype, w1, w1_scale, w2, w2_scale, expected_m)
     # low latency combine
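The decode-path log has to stay commented out because it cannot execute while the decode CUDA graph is in use. A minimal graph-safe sketch, not part of this commit: it reuses the module's logger, dist, masked_m, and expected_m, and guards on torch.cuda.is_current_stream_capturing() (a standard PyTorch check) so the host sync is skipped during graph capture.

import torch

def log_decode_balance(logger, dist, masked_m, expected_m):
    # Skip while a CUDA graph is being captured: .item() forces a host sync,
    # which is not allowed (and not meaningful) inside graph capture.
    if torch.cuda.is_current_stream_capturing():
        return
    rank = dist.get_rank()
    all_tokens = int(masked_m.sum().item())  # assumes masked_m is a GPU int tensor
    logger.info(
        f"decode, [{rank}], all_tokens = {all_tokens}, expected_m = {expected_m}, "
        f"num_recv_tokens_per_expert: {masked_m.tolist()}"
    )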

lightllm/common/fused_moe/topk_select.py

Lines changed: 8 additions & 0 deletions
@@ -21,6 +21,7 @@
 import torch
 from lightllm.utils.sgl_utils import sgl_ops
 from lightllm.utils.light_utils import light_ops
+from lightllm.utils.balance_utils import BalancedTensor
 from typing import Callable, List, Optional, Tuple
 from lightllm.common.fused_moe.softmax_topk import softmax_topk
 

@@ -227,4 +228,11 @@ def select_experts(
         hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize
     )
 
+    # Switch for EP fake load balancing
+    if os.environ.get("EP_FAKE_BALANCE_ENABLED") == "true":
+        M, _ = hidden_states.shape
+        balanced_tensor_collection = BalancedTensor()
+        balance_topk_ids = balanced_tensor_collection.get_balance_topk_ids(M)
+        topk_ids.copy_(balance_topk_ids)
+
     return topk_weights, topk_ids
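Note that the override only fires when the environment variable is the exact string "true" ("1" or "TRUE" do not match), and topk_ids.copy_ requires the cached tensor's shape (M, num_selected) to match the router output, so the BalancedTensor defaults (256 experts, top-8) must agree with the model. A minimal sketch of the override in isolation, with a stand-in router output (CUDA required):

import os
import torch
from lightllm.utils.balance_utils import BalancedTensor

os.environ["EP_FAKE_BALANCE_ENABLED"] = "true"  # exact lowercase string "true"

M = 16  # tokens in this batch
topk_ids = torch.zeros((M, 8), dtype=torch.int, device="cuda")  # stand-in for the router's top-8 ids
if os.environ.get("EP_FAKE_BALANCE_ENABLED") == "true":
    # Same in-place overwrite that select_experts performs; topk_weights are untouched.
    topk_ids.copy_(BalancedTensor().get_balance_topk_ids(M))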

lightllm/utils/balance_utils.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import torch
+import os
+
+import threading
+
+def singleton_threadsafe(cls):
+    instances = {}
+    lock = threading.Lock()
+
+    def get_instance(*args, **kwargs):
+        with lock:
+            if cls not in instances:
+                instances[cls] = cls(*args, **kwargs)
+            return instances[cls]
+    return get_instance
+
+@singleton_threadsafe
+class BalancedTensor:
+    def __init__(self, num_experts=256, num_selected=8):
+        self.balanced_tensors = {}
+        self.num_experts = num_experts
+        self.num_selected = num_selected
+
+    def generate_balanced_tensor(self, length):
+        # Initialize an all-zero length x num_selected tensor on the GPU
+        tensor = torch.zeros((length, self.num_selected), dtype=torch.int, device='cuda')
+        # Initialize the load counter for each expert
+        expert_load = torch.zeros(self.num_experts, dtype=torch.int, device='cuda')
+
+        for i in range(length):
+            available_experts = torch.arange(self.num_experts, device='cuda')
+            selected = []
+            for _ in range(self.num_selected):
+                # Compute the current load of each available expert
+                current_load = expert_load[available_experts]
+                # Pick the expert with the smallest load
+                min_load_indices = torch.where(current_load == current_load.min())[0]
+                if len(min_load_indices) > 1:
+                    # If several experts share the minimum load, pick one at random
+                    chosen_index = torch.randint(0, len(min_load_indices), (1,), device='cuda').item()
+                    chosen_expert_index = min_load_indices[chosen_index]
+                else:
+                    chosen_expert_index = min_load_indices[0]
+                chosen_expert = available_experts[chosen_expert_index]
+                selected.append(chosen_expert)
+                # Remove the chosen expert from the list of available experts
+                available_experts = torch.cat(
+                    [available_experts[:chosen_expert_index], available_experts[chosen_expert_index + 1:]])
+                # Update that expert's load
+                expert_load[chosen_expert] += 1
+            tensor[i] = torch.tensor(selected, dtype=torch.int, device='cuda')
+        return tensor
+
+    def get_balance_topk_ids(self, length):
+        if self.balanced_tensors.get(length) is not None:
+            # print("find length ", length)
+            return self.balanced_tensors[length]
+        else:
+            # print("generate length ", length)
+            tensor = self.generate_balanced_tensor(length)
+            self.balanced_tensors[length] = tensor
+            return tensor
+
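A hypothetical smoke test, not part of the commit, for the two properties the module relies on: the decorator collapses every construction into one instance, and the generator spreads load evenly. Run it in a fresh process (the singleton ignores constructor arguments after the first call) on a machine with a CUDA device:

import torch
from lightllm.utils.balance_utils import BalancedTensor

bt = BalancedTensor(num_experts=16, num_selected=4)  # small sizes for a quick check
assert BalancedTensor() is bt                        # singleton: same object on every call

ids = bt.get_balance_topk_ids(32)                    # shape (32, 4), generated once then cached
assert ids.shape == (32, 4)
assert bt.get_balance_topk_ids(32) is ids            # cache hit returns the stored tensor

load = torch.bincount(ids.flatten().long(), minlength=16)
print(load)  # expect 8 per expert: 32*4 picks, and the greedy keeps loads within one of each other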

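One caveat on cost: generate_balanced_tensor runs a Python loop with several small GPU kernels and a host sync (.item()) per pick, so the first call for a new length can be slow; the per-length cache amortizes that across steps. For comparison, a hypothetical single-shot round-robin construction with the same two properties (distinct experts per row, per-expert load within one of the mean):

import torch

def round_robin_topk_ids(length, num_experts=256, num_selected=8):
    # Token i takes experts i*num_selected .. i*num_selected + num_selected - 1 (mod num_experts).
    # Consecutive values are distinct within a row (num_selected <= num_experts),
    # and cycling through expert ids keeps every expert's load within one of the mean.
    base = torch.arange(length, device="cuda").unsqueeze(1) * num_selected
    offset = torch.arange(num_selected, device="cuda").unsqueeze(0)
    return ((base + offset) % num_experts).to(torch.int)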