
import threading

+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
def singleton_threadsafe(cls):
    instances = {}
    lock = threading.Lock()
-
    def get_instance(*args, **kwargs):
+        # A key that includes the arguments is needed for parameter-dependent singletons.
+        # Using a tuple of args and a frozenset of kwargs items makes it hashable.
+        key = (cls, args, frozenset(kwargs.items()))
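+        # Note: this relies on every positional and keyword argument being hashable;
+        # an unhashable argument (e.g. a list) would raise a TypeError when the key is built.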
        with lock:
-            if cls not in instances:
-                instances[cls] = cls(*args, **kwargs)
-            return instances[cls]
+            if key not in instances:
+                instances[key] = cls(*args, **kwargs)
+            return instances[key]
    return get_instance
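+# Illustration only (hypothetical class, not part of this change): because the cache key
+# includes the constructor arguments, equal arguments share one instance and different
+# arguments get their own, e.g.
+#
+#   @singleton_threadsafe
+#   class Config:
+#       def __init__(self, n):
+#           self.n = n
+#
+#   assert Config(1) is Config(1)      # same args      -> same cached instance
+#   assert Config(1) is not Config(2)  # different args -> separate instance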

@singleton_threadsafe
@@ -21,43 +27,38 @@ def __init__(self, num_experts=256, num_selected=8):
        self.num_experts = num_experts
        self.num_selected = num_selected

-    def generate_balanced_tensor(self, length):
-        # Initialize a length * 8 all-zero tensor, placed on the GPU
-        tensor = torch.zeros((length, self.num_selected), dtype=torch.int, device='cuda')
-        # Initialize a load counter for every expert
+    def gemini_generate_balanced_tensor(self, length):
+        # A more performant way to generate a balanced tensor for expert selection.
+        tensor = torch.empty((length, self.num_selected), dtype=torch.int, device='cuda')
        expert_load = torch.zeros(self.num_experts, dtype=torch.int, device='cuda')
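+        # expert_load[e] counts how many times expert e has been selected so far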

+        expert_indices = torch.arange(self.num_experts, device='cuda')
+
        for i in range(length):
-            available_experts = torch.arange(self.num_experts, device='cuda')
-            selected = []
-            for _ in range(self.num_selected):
-                # Compute the current load of each available expert
-                current_load = expert_load[available_experts]
-                # Pick the expert with the smallest load
-                min_load_indices = torch.where(current_load == current_load.min())[0]
-                if len(min_load_indices) > 1:
-                    # If several experts share the minimum load, choose one at random
-                    chosen_index = torch.randint(0, len(min_load_indices), (1,), device='cuda').item()
-                    chosen_expert_index = min_load_indices[chosen_index]
-                else:
-                    chosen_expert_index = min_load_indices[0]
-                chosen_expert = available_experts[chosen_expert_index]
-                selected.append(chosen_expert)
-                # Remove the chosen expert from the list of available experts
-                available_experts = torch.cat(
-                    [available_experts[:chosen_expert_index], available_experts[chosen_expert_index + 1:]])
-                # Update that expert's load
-                expert_load[chosen_expert] += 1
-            tensor[i] = torch.tensor(selected, dtype=torch.int, device='cuda')
+            # To break ties randomly when loads are equal, we can shuffle indices
+            # of experts with the same load. A simple way is to shuffle all
+            # indices and then sort by load.
+            shuffled_indices = expert_indices[torch.randperm(self.num_experts, device='cuda')]
+            sorted_shuffled_indices = shuffled_indices[torch.argsort(expert_load[shuffled_indices])]
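+            # Note: torch.argsort is not guaranteed to be stable here, so ties between
+            # equal loads may also be broken by the sort itself; the up-front shuffle
+            # mainly guards against a deterministic bias toward low expert ids.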
+
+            # Select the top `num_selected` experts with the lowest load
+            selected_experts = sorted_shuffled_indices[:self.num_selected]
+
+            tensor[i] = selected_experts
+
+            # Update loads for the selected experts using an efficient scatter_add
+            expert_load.scatter_add_(0, selected_experts, torch.ones_like(selected_experts, dtype=torch.int))
+
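+        # At this point each row of `tensor` holds self.num_selected distinct expert ids,
+        # and because the least-loaded experts are always picked first, the per-expert
+        # counts in expert_load should never differ by more than one.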
        return tensor

-    def get_balance_topk_ids(self, length):
-        if self.balanced_tensors.get(length) is not None:
-            #print("find length ", length)
-            return self.balanced_tensors[length]
+
+    def get_balance_topk_ids(self, num_tokens):
+        if self.balanced_tensors.get(num_tokens) is not None:
+            # logger.info(f"find balanced tensor for num_tokens={num_tokens}")
+            return self.balanced_tensors[num_tokens]
        else:
-            #print( "generate length ", length )
-            tensor = self.generate_balanced_tensor(length)
-            self.balanced_tensors[length] = tensor
+            # logger.info(f"generate balanced tensor for num_tokens={num_tokens}")
+            tensor = self.gemini_generate_balanced_tensor(num_tokens)
+            self.balanced_tensors[num_tokens] = tensor
            return tensor