
Commit 94b8db8

Author: none
Commit message: fix
1 parent f7bd1c5

File tree: 2 files changed, +12 −2 lines

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 10 additions & 0 deletions
@@ -17,6 +17,7 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
+from lightllm.utils.envs_utils import is_triton_autotune_enabled
 from lightllm.utils.log_utils import init_logger

 logger = init_logger(__name__)
@@ -353,6 +354,15 @@ def prefilled_group_gemm(
         )
         # gather and local reduce
         ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
+    else:
+        ######################################## warning ##################################################
+        # Keep parity with the autotune feature: every rank must run the same Triton kernels.
+        # In some special cases a rank receives 0 tokens, so launch the kernel on one dummy token.
+        if is_triton_autotune_enabled():
+            _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
+            _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
+            silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
+            _gemm_out_a, _silu_out = None, None

     return gather_out
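
The added else branch performs a deliberate throwaway launch of silu_and_mul_fwd: Triton autotuning benchmarks a kernel on its first launch, so if one rank receives 0 tokens and skips the launch while the others do not, the ranks diverge in which kernels they tune and can fall out of lockstep. Running the kernel once on a single zero-filled row keeps every rank on the same code path. The (1, N) input and (1, N // 2) output also suggest the kernel's semantics: silu_and_mul_fwd appears to be the usual fused gated activation. Below is a minimal PyTorch sketch of those assumed semantics; silu_and_mul_ref is illustrative, not the project's Triton kernel:

import torch

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Assumed semantics of silu_and_mul_fwd: split the last dimension in half,
    # apply SiLU to the first half (the gate), multiply by the second half.
    gate, up = x.chunk(2, dim=-1)
    return torch.nn.functional.silu(gate) * up

# The dummy launch from the diff, with N = 8 for illustration:
x = torch.zeros((1, 8))
out = silu_and_mul_ref(x)  # shape (1, 4), i.e. (1, N // 2), matching _silu_out
assert out.shape == (1, 4) and bool((out == 0).all())  # silu(0) * 0 == 0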

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 2 additions & 2 deletions
@@ -192,8 +192,8 @@ def fused_experts_impl(
     # Keep parity with the autotune feature: every rank must run the same Triton kernels.
     # In some special cases a rank receives 0 tokens, so launch the kernel on one dummy token.
     if is_triton_autotune_enabled():
-        _gemm_out_a = torch.empty((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
-        _silu_out = torch.empty((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
+        _gemm_out_a = torch.zeros((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
+        _silu_out = torch.zeros((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
         silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
         _gemm_out_a, _silu_out = None, None

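The two-line change itself swaps torch.empty for torch.zeros in the dummy tensors. The commit message is only "fix", but a plausible reading is that torch.empty returns uninitialized memory, which can decode to NaN or Inf, and those values would flow through the dummy silu_and_mul_fwd launch (and into any autotune timing or debug checks); torch.zeros guarantees a finite, well-defined input. A small sketch of the difference:

import torch

# torch.empty only allocates; the contents are whatever bits were already in
# the buffer, so they may decode to NaN or Inf for floating-point dtypes.
garbage = torch.empty((1, 8), dtype=torch.float16)

# torch.zeros initializes the buffer, so the dummy kernel input is always
# finite and the fused activation output stays finite as well.
safe = torch.zeros((1, 8), dtype=torch.float16)

assert bool(torch.isfinite(safe).all())  # guaranteed
print(torch.isfinite(garbage).all())     # not guaranteed: may print False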
