 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
+from lightllm.utils.envs_utils import is_triton_autotune_enabled
 import numpy as np
 
 logger = init_logger(__name__)
@@ -186,6 +187,16 @@ def fused_experts_impl(
 
         # gather and local reduce
        ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
+    else:
+        ######################################## warning ##################################################
+        # Keep the Triton autotune feature consistent: the MoE model must launch the same Triton
+        # kernels on every rank. A rank may receive 0 tokens, so run the kernel on one dummy token here.
+        if is_triton_autotune_enabled():
+            _gemm_out_a = torch.empty((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
+            _silu_out = torch.empty((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
+            silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
+            _gemm_out_a, _silu_out = None, None
+
     # normal combine
     combined_x, _, event = buffer.combine(
         gather_out,
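
The pattern in the added else branch generalizes: when Triton autotuning is enabled, every rank has to launch the same kernels, including a rank that received zero tokens after dispatch. The sketch below is a minimal, self-contained illustration of that idea, not lightllm code: run_silu_and_mul is a plain PyTorch stand-in for the fused silu_and_mul_fwd kernel, and launch_dummy_if_empty is a hypothetical helper name introduced only for this example.

import torch
import torch.nn.functional as F

def run_silu_and_mul(x: torch.Tensor, out: torch.Tensor) -> None:
    # Stand-in for a fused kernel: out = silu(x[:, :N//2]) * x[:, N//2:].
    n_half = x.shape[-1] // 2
    out.copy_(F.silu(x[:, :n_half]) * x[:, n_half:])

def launch_dummy_if_empty(recv_tokens: torch.Tensor, N: int, autotune_enabled: bool) -> None:
    # If this rank received no tokens, still run the kernel once on a single
    # dummy token so every rank executes (and autotunes) the same kernel.
    if autotune_enabled and recv_tokens.shape[0] == 0:
        dummy_in = torch.empty((1, N), device=recv_tokens.device, dtype=recv_tokens.dtype)
        dummy_out = torch.empty((1, N // 2), device=recv_tokens.device, dtype=recv_tokens.dtype)
        run_silu_and_mul(dummy_in, dummy_out)

The outputs of the dummy launch are discarded; the call exists only so this rank executes the same kernel sequence as the ranks that did receive tokens.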