
Commit ece3d13

Commit message: fix
1 parent 711b730 commit ece3d13

File tree

4 files changed: +26 −69 lines changed


lightllm/common/basemodel/layer_weights/meta_weights/gpt_oss_fused_moe_weight_tp.py

Lines changed: 3 additions & 39 deletions
@@ -116,26 +116,13 @@ def load_hf_weights(self, weights):
             w2_bias = weights[self._down_bias_name]
             self.w2_bias = self._cuda(w2_bias)

-    # Keep torch version code for reference
-    def _torch_router(self, router_logits, top_k, layer_num):
+    def router(self, router_logits, top_k):
         router_top_value, router_indices = torch.topk(router_logits, top_k, dim=-1)
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
-        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
-        return router_scores, router_indices
+        return router_top_value, router_indices

     def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
-        from lightllm.common.fused_moe.topk_select import select_experts
-
-        topk_weights, topk_ids = select_experts(
-            hidden_states=input_tensor,
-            router_logits=router_logits,
-            correction_bias=self.e_score_correction_bias,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-        )
+        topk_weights, topk_ids = self.router(router_logits, top_k)

         w1, w1_scale = self.w1
         w2, w2_scale = self.w2
@@ -161,29 +148,6 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
         )
         return output_tensor

-    def _torch_experts(self, hidden_states: torch.Tensor, routing_weights, layer_num):
-        w1, w1_scale = self.w1
-        w2, w2_scale = self.w2
-        assert w1_scale is None and w2_scale is None, "For now, we do not support quantized weight in GPT-OSS."
-
-        batch_size = hidden_states.shape[0]
-        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
-        num_experts = routing_weights.shape[1]
-
-        hidden_states = hidden_states.repeat(num_experts, 1)
-        hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
-        gate_up = torch.bmm(hidden_states, w1.transpose(1, 2)) + self.w1_bias[..., None, :]
-        gate, up = gate_up[..., ::2], gate_up[..., 1::2]
-        gate = gate.clamp(min=None, max=self.limit)
-        up = up.clamp(min=-self.limit, max=self.limit)
-        glu = gate * torch.sigmoid(gate * self.alpha)
-        next_states = torch.bmm(((up + 1) * glu), w2.transpose(1, 2))
-        next_states = next_states + self.w2_bias[..., None, :] / self.tp_world_size_
-        next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
-        next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
-        next_states = next_states.sum(dim=0)
-        return next_states
-
     def _convert_moe_packed_tensors(
         self,
         blocks,
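The new router() selects the top-k experts per token and softmax-normalizes only the selected logits; the dense scatter back into a full [num_tokens, num_experts] score matrix is no longer needed because the fused MoE path consumes (topk_weights, topk_ids) directly. A minimal standalone sketch of the equivalent computation (tensor names and sizes here are illustrative, not part of the repo):

import torch

def topk_softmax_router(router_logits: torch.Tensor, top_k: int):
    # router_logits: [num_tokens, num_experts]
    top_values, top_ids = torch.topk(router_logits, top_k, dim=-1)
    # softmax over the selected experts only, as in the new router()
    top_weights = torch.nn.functional.softmax(top_values, dim=-1, dtype=top_values.dtype)
    return top_weights, top_ids  # each [num_tokens, top_k]

logits = torch.randn(4, 32)  # e.g. 4 tokens, 32 experts (example sizes)
weights, ids = topk_softmax_router(logits, top_k=4)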

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 7 additions & 6 deletions
@@ -394,6 +394,9 @@ def grouped_matmul_kernel(
     weight_stride_0,
     weight_stride_1,
     weight_stride_2,
+    bias_ptr,  # [expert_num, n]
+    bias_stride_0,
+    bias_stride_1,
     expert_to_weights_ptr,  # [expert_num, token_num * topk]
     expert_to_weights_stride0,
     expert_to_weights_stride1,
@@ -418,9 +421,6 @@ def grouped_matmul_kernel(
     MUL_ROUTED_WEIGHT: tl.constexpr = False,
     NEED_K_MASK: tl.constexpr = True,
     NEED_TRANS: tl.constexpr = False,
-    # Bias support
-    bias_ptr=None,  # [expert_num, n]
-    bias_stride_0=0,
     ADD_BIAS: tl.constexpr = False,
 ):
     pid = tl.program_id(0)
@@ -535,7 +535,7 @@ def grouped_matmul_kernel(
     if ADD_BIAS:
         offs_bn_bias = offs_bn  # [BLOCK_SIZE_N]
         bias_ptrs = bias_ptr + expert_id * bias_stride_0 + offs_bn_bias
-        bias_vals = tl.load(bias_ptrs, mask=offs_bn_bias < n, other=0.0)  # [BLOCK_SIZE_N]
+        bias_vals = tl.load(bias_ptrs)  # [BLOCK_SIZE_N]
         accumulator += bias_vals[None, :]  # broadcast across M dimension

     if MUL_ROUTED_WEIGHT:
@@ -728,6 +728,9 @@ def grouped_matmul(
         expert_weights.stride(0),
         expert_weights.stride(1),
         expert_weights.stride(2),
+        bias,
+        bias.stride(0) if bias is not None else 0,
+        bias.stride(1) if bias is not None and bias.ndim >= 2 else 0,
         expert_to_weights,
         expert_to_weights.stride(0),
         expert_to_weights.stride(1),
@@ -753,8 +756,6 @@ def grouped_matmul(
         num_warps=num_warps,
         num_stages=num_stages,
         ADD_BIAS=bias is not None,
-        bias_ptr=bias,
-        bias_stride_0=bias.stride(0) if bias is not None else 0,
     )
     return (mblocks_to_expert_id, mblocks_to_m_index, BLOCK_SIZE_M)
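Here the per-expert bias enters the Triton kernel through regular positional pointer/stride arguments next to the weight strides, while ADD_BIAS stays a constexpr so the bias load is only compiled in when a bias tensor is actually passed. Semantically, the ADD_BIAS branch adds one bias row per expert to that expert's output tile. A plain PyTorch reference of that behavior (a sketch only; the function name and shapes are illustrative, not the kernel's signature):

import torch

def grouped_matmul_with_bias_ref(x, expert_ids, weights, bias=None):
    # x: [num_tokens, k], expert_ids: [num_tokens] (expert chosen per token)
    # weights: [num_experts, n, k], bias: [num_experts, n] or None
    out = torch.zeros(x.shape[0], weights.shape[1], dtype=x.dtype, device=x.device)
    for e in range(weights.shape[0]):
        sel = expert_ids == e
        if sel.any():
            y = x[sel] @ weights[e].t()
            if bias is not None:  # mirrors the ADD_BIAS branch
                y = y + bias[e]
            out[sel] = y
    return out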

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 16 additions & 16 deletions
@@ -16,13 +16,14 @@ def _silu_and_mul_kernel_fast(
     stride_output_n,
     size_m,
     size_n,
+    limit: tl.constexpr,
+    alpha: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     NUM_STAGES: tl.constexpr,
     NEED_MASK: tl.constexpr,
     layout: tl.constexpr = "blocked",  # "blocked" or "interleaved"
-    limit=None,
-    alpha=None,
+    USE_LIMIT_AND_ALPHA: tl.constexpr = False,
 ):
     stride_input_m = tl.cast(stride_input_m, dtype=tl.int64)
     stride_output_m = tl.cast(stride_output_m, dtype=tl.int64)
@@ -63,27 +64,23 @@ def _silu_and_mul_kernel_fast(
             other=other,
         ).to(tl.float32)

-        if limit is None and alpha is None:
-            gate = gate / (1 + tl.exp(-gate))
+        if USE_LIMIT_AND_ALPHA:
+            gate = tl.minimum(gate, limit)
+            up = tl.minimum(tl.maximum(up, -limit), limit)
+            gate = 1 / (1 + tl.exp(-gate * alpha)) * gate
             gate = gate.to(input_ptr.dtype.element_ty)
-
             tl.store(
                 output_ptr + out_offsets,
-                up * gate,
+                (up + 1) * gate,
                 mask=mask,
             )
         else:
-            # clamp up and gate
-            if limit is not None:
-                gate = tl.minimum(gate, limit)
-                up = tl.minimum(tl.maximum(up, -limit), limit)
-            if alpha is None:
-                alpha = 1.0
-            gate = 1 / (1 + tl.exp(-gate * alpha)) * gate
+            gate = gate / (1 + tl.exp(-gate))
             gate = gate.to(input_ptr.dtype.element_ty)
+
             tl.store(
                 output_ptr + out_offsets,
-                (up + 1) * gate,
+                up * gate,
                 mask=mask,
             )

@@ -114,6 +111,7 @@ def silu_and_mul_fwd(
 ):
     assert input.is_contiguous()
     assert output.is_contiguous()
+    assert (limit is None and alpha is None) or (limit is not None and alpha is not None)

     stride_input_m = input.stride(0)
     stride_input_n = input.stride(1)
@@ -132,6 +130,7 @@ def silu_and_mul_fwd(
     # limit the grid size to avoid the invalid argument error of triton
     while triton.cdiv(size_m, BLOCK_M) > 8192:
         BLOCK_M *= 2
+    USE_LIMIT_AND_ALPHA = limit is not None and alpha is not None

     grid = (
         triton.cdiv(size_n, BLOCK_N),
@@ -147,13 +146,14 @@ def silu_and_mul_fwd(
         stride_output_n=stride_output_n,
         size_m=size_m,
         size_n=size_n,
+        limit=limit,
+        alpha=alpha,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         NUM_STAGES=NUM_STAGES,
         NEED_MASK=NEED_MASK,
         num_warps=num_warps,
         layout=layout,
-        limit=limit,
-        alpha=alpha,
+        USE_LIMIT_AND_ALPHA=USE_LIMIT_AND_ALPHA,
     )
     return
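With limit and alpha now tl.constexpr parameters and USE_LIMIT_AND_ALPHA computed on the host, the kernel specializes at compile time into two branches: the GPT-OSS path clamps gate and up, applies an alpha-scaled sigmoid gate, and multiplies by (up + 1); the default path is the usual SiLU-and-mul. A plain PyTorch reference of both branches (a sketch reconstructed from the kernel above; names are illustrative):

import torch

def silu_and_mul_ref(gate: torch.Tensor, up: torch.Tensor, limit=None, alpha=None):
    # Either both limit and alpha are given (GPT-OSS path) or neither,
    # matching the assert added to silu_and_mul_fwd.
    if limit is not None and alpha is not None:
        gate = gate.clamp(max=limit)
        up = up.clamp(min=-limit, max=limit)
        glu = gate * torch.sigmoid(gate * alpha)
        return (up + 1) * glu
    return up * torch.nn.functional.silu(gate)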

lightllm/models/gpt_oss/layer_infer/transformer_layer_infer.py

Lines changed: 0 additions & 8 deletions
@@ -51,14 +51,6 @@ def _gpt_oss_rmsnorm(self, hidden_states, weight, eps=1e-6):
         hidden_states = hidden_states * torch.rsqrt(variance + eps)
         return (weight * hidden_states).to(input_dtype)  # main diff with Llama

-    def _torch_router(self, hidden_states, layer_weight: GptOssTransformerLayerWeight):
-        hidden_states = hidden_states.reshape(-1, self.hidden_size)
-        router_logits = layer_weight.moe_gate.mm(hidden_states)
-        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
-        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
-        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
-        return router_scores, router_indices
-
     def _ffn(
         self, input, infer_state: FlashAttentionStateInfo, layer_weight: GptOssTransformerLayerWeight
     ) -> torch.Tensor:
