
Commit 268c18e

Author: sangchengmeng
Commit message: fix rms_norm
Parent: feb505b

File tree: 10 files changed (121 additions, 137 deletions)


lightllm/models/chatglm2/layer_infer/transformer_layer_infer.py
Lines changed: 0 additions & 1 deletion

@@ -10,7 +10,6 @@
 
 from lightllm.models.chatglm2.triton_kernel.rotary_emb import rotary_emb_fwd
 from lightllm.common.basemodel.triton_kernel.destindex_copy_kv import destindex_copy_kv, destindex_copy_quantize_kv
-from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward
 
 
 class ChatGLM2TransformerLayerInfer(LlamaTransformerLayerInfer):

lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py
Lines changed: 6 additions & 6 deletions

@@ -154,16 +154,16 @@ def _get_qkv(
             q = layer_weight.q_weight_.mm(input)
         else:
             q = layer_weight.q_a_proj_.mm(input)
-            rmsnorm_forward(q, weight=layer_weight.q_a_layernorm_.weight, eps=self.eps_, out=q)
+            q = rmsnorm_forward(q, weight=layer_weight.q_a_layernorm_.weight, eps=self.eps_, use_custom_tensor_mananger=True)
             q = layer_weight.q_b_proj_.mm(q)
         q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim)
         q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
         layer_weight.kv_a_proj_with_mqa_.mm(input, out=cache_kv.view(-1, self.kv_lora_rank + self.qk_rope_head_dim))
-        rmsnorm_forward(
+        cache_kv[:, :, : self.kv_lora_rank] = rmsnorm_forward(
             cache_kv[:, :, : self.kv_lora_rank],
             weight=layer_weight.kv_a_layernorm_.weight,
             eps=self.eps_,
-            out=cache_kv[:, :, : self.kv_lora_rank],
+            use_custom_tensor_mananger=True
         )
 
         rotary_emb_fwd(
@@ -191,16 +191,16 @@ def _tpsp_get_qkv(
             q = layer_weight.q_weight_.mm(input)
         else:
             q = layer_weight.q_a_proj_.mm(input)
-            rmsnorm_forward(q, weight=layer_weight.q_a_layernorm_.weight, eps=self.eps_, out=q)
+            q = rmsnorm_forward(q, weight=layer_weight.q_a_layernorm_.weight, eps=self.eps_)
             q = layer_weight.q_b_proj_.mm(q)
         q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim)
         q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
         layer_weight.kv_a_proj_with_mqa_.mm(input, out=cache_kv.view(-1, self.kv_lora_rank + self.qk_rope_head_dim))
-        rmsnorm_forward(
+        cache_kv[:, :, : self.kv_lora_rank] = rmsnorm_forward(
             cache_kv[:, :, : self.kv_lora_rank],
             weight=layer_weight.kv_a_layernorm_.weight,
             eps=self.eps_,
-            out=cache_kv[:, :, : self.kv_lora_rank],
+            use_custom_tensor_mananger=True
         )
         rotary_emb_fwd(
             q_rope,
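Both hunks above make the same switch: rmsnorm_forward no longer takes an out= argument, and the call sites consume the returned tensor instead. Below is a minimal plain-PyTorch sketch of the resulting pattern; the rms_norm_ref helper and the toy shapes are illustrative stand-ins, and only the formula is taken from torch_rms_norm in this commit.

# Illustrative sketch (plain PyTorch, not the Triton kernel): normalize a slice
# out-of-place, then copy the result back into the cache via slice assignment,
# instead of asking the kernel to write into `out=` in place.
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # same math as torch_rms_norm in lightllm/models/llama/triton_kernel/rmsnorm.py
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight

kv_lora_rank, rope_dim = 4, 2  # toy sizes, not model config values
cache_kv = torch.randn(3, 1, kv_lora_rank + rope_dim)
weight = torch.ones(kv_lora_rank)

# New pattern: take the returned tensor and assign it back into the cache slice.
cache_kv[:, :, :kv_lora_rank] = rms_norm_ref(cache_kv[:, :, :kv_lora_rank], weight, eps=1e-6)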

lightllm/models/deepseek_mtp/layer_infer/pre_layer_infer.py
Lines changed: 4 additions & 4 deletions

@@ -20,8 +20,8 @@ def _mtp_context_forward(
     ):
         tgt_embdings = infer_state.deepseekv3_mtp_draft_input_hiddens
         assert input_embdings.shape[0] == tgt_embdings.shape[0]
-        rmsnorm_forward(input_embdings, weight=layer_weight.enorm_weight_, eps=self.eps_, out=input_embdings)
-        rmsnorm_forward(tgt_embdings, weight=layer_weight.hnorm_weight_, eps=self.eps_, out=tgt_embdings)
+        input_embdings = rmsnorm_forward(input_embdings, weight=layer_weight.enorm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
+        tgt_embdings = rmsnorm_forward(tgt_embdings, weight=layer_weight.hnorm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
 
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)
 
@@ -36,8 +36,8 @@ def _mtp_token_forward(
     ):
         tgt_embdings = infer_state.deepseekv3_mtp_draft_input_hiddens
         assert input_embdings.shape[0] == tgt_embdings.shape[0]
-        rmsnorm_forward(input_embdings, weight=layer_weight.enorm_weight_, eps=self.eps_, out=input_embdings)
-        rmsnorm_forward(tgt_embdings, weight=layer_weight.hnorm_weight_, eps=self.eps_, out=tgt_embdings)
+        input_embdings = rmsnorm_forward(input_embdings, weight=layer_weight.enorm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
+        tgt_embdings = rmsnorm_forward(tgt_embdings, weight=layer_weight.hnorm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
 
         cat_embdings = torch.cat((input_embdings, tgt_embdings), dim=-1)

lightllm/models/llama/layer_infer/post_layer_infer.py
Lines changed: 3 additions & 2 deletions

@@ -8,12 +8,13 @@
 from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
 from einops import rearrange
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
-from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm
+from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward
 from lightllm.common.basemodel import PostLayerInferTpl
 from lightllm.utils.infer_utils import mark_cost_time
 from lightllm.distributed.communication_op import all_gather
 
 
+
 class LlamaPostLayerInfer(PostLayerInferTpl):
     """ """
 
@@ -25,7 +26,7 @@ def __init__(self, network_config, mode):
         return
 
     def _norm(self, input, infer_state, layer_weight: LlamaPreAndPostLayerWeight) -> torch.Tensor:
-        return rms_norm(input, layer_weight.final_norm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
+        return rmsnorm_forward(input, layer_weight.final_norm_weight_, eps=self.eps_, use_custom_tensor_mananger=True)
 
     def _slice_get_last_input(self, input_embdings, infer_state: LlamaInferStateInfo):

lightllm/models/llama/layer_infer/transformer_layer_infer.py
Lines changed: 4 additions & 8 deletions

@@ -1,3 +1,4 @@
+import os
 import torch
 import triton
 import torch.functional as F
@@ -14,7 +15,7 @@
 from lightllm.models.llama.triton_kernel.token_attention_nopad_att1 import token_att_fwd, token_att_fwd_int8k
 from lightllm.models.llama.triton_kernel.token_attention_nopad_softmax import token_softmax_fwd
 from lightllm.models.llama.triton_kernel.token_attention_nopad_reduceV import token_att_fwd2, token_att_fwd2_int8v
-from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm
+from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward
 from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd
 from lightllm.models.llama.triton_kernel.silu_and_mul import silu_and_mul_fwd
 
@@ -32,7 +33,6 @@
 
 from lightllm.utils.sgl_utils import flash_attn_with_kvcache
 
-
 class LlamaTransformerLayerInfer(TransformerLayerInferTpl):
     """ """
 
@@ -134,16 +134,12 @@ def _bind_attention(self):
     def _att_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
     ) -> torch.Tensor:
-        out = self.alloc_tensor(input.shape, input.dtype)
-        rms_norm(input, weight=layer_weight.att_norm_weight_.weight, eps=self.eps_, use_custom_tensor_mananger=True)
-        return out
+        return rmsnorm_forward(input, weight=layer_weight.att_norm_weight_.weight, eps=self.eps_, use_custom_tensor_mananger=True)
 
     def _ffn_norm(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight
     ) -> torch.Tensor:
-        out = self.alloc_tensor(input.shape, input.dtype)
-        rms_norm(input, weight=layer_weight.ffn_norm_weight_.weight, eps=self.eps_, use_custom_tensor_mananger=True)
-        return out
+        return rmsnorm_forward(input, weight=layer_weight.ffn_norm_weight_.weight, eps=self.eps_, use_custom_tensor_mananger=True)
 
     def _get_qkv(
         self, input, cache_kv, infer_state: LlamaInferStateInfo, layer_weight: LlamaTransformerLayerWeight

lightllm/models/llama/triton_kernel/rmsnorm.py
Lines changed: 97 additions & 8 deletions

@@ -1,11 +1,11 @@
+import os
 import torch
-
 import triton
 import triton.language as tl
-
+from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
 @triton.jit
-def _rms_norm_fwd_fused(
+def _rms_norm_low_accuracy_kernel(
     X,  # pointer to the input
     Y,  # pointer to the output
     W,  # pointer to the weights
@@ -41,9 +41,15 @@ def _rms_norm_fwd_fused(
     tl.store(Y + cols * y_stride1, y.to(Y.dtype.element_ty), mask=mask)
 
 
-def rmsnorm_forward(x: torch.Tensor, weight, eps, out=None):
+def rmsnorm_forward_low_accuracy(x: torch.Tensor, weight, eps, use_custom_tensor_mananger: bool = False):
     # allocate output
-    y = torch.empty_like(x) if out is None else out
+    if use_custom_tensor_mananger:
+        shape = x.shape
+        dtype = x.dtype
+        device = x.device
+        y = g_cache_manager.alloc_tensor(shape, dtype, device=device)
+    else:
+        y = torch.empty_like(x)
     # reshape input data into 2D tensor
     x_arg = x.view(-1, x.shape[-1])
     y_arg = y.view(-1, x.shape[-1])
@@ -61,7 +67,7 @@ def rmsnorm_forward(x: torch.Tensor, weight, eps, out=None):
     if BLOCK_SIZE > 16384:
         BLOCK_SIZE = 16384
     # enqueue kernel
-    _rms_norm_fwd_fused[(M,)](
+    _rms_norm_low_accuracy_kernel[(M,)](
        x_arg,
        y_arg,
        weight,
@@ -77,6 +83,80 @@ def rmsnorm_forward(x: torch.Tensor, weight, eps, out=None):
     return y
 
 
+@triton.jit
+def _rms_norm_high_accuracy_kernel(
+    input,
+    weight,
+    output,
+    in_row_stride: tl.constexpr,
+    in_col_stride: tl.constexpr,
+    out_row_stride: tl.constexpr,
+    out_col_stride: tl.constexpr,
+    eps: tl.constexpr,
+    N_COLS: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    """Rms norm kernel."""
+    prog_id = tl.program_id(0)
+    offsets = tl.arange(0, BLOCK_N)
+
+    w = tl.load(weight + offsets, mask=offsets < N_COLS, other=0.0)
+
+    x_ptr = input + prog_id * in_row_stride
+    x = tl.load(x_ptr + offsets * in_col_stride, mask=offsets < N_COLS, other=0.0)
+    xf = x.to(tl.float32)
+
+    var = tl.sum(xf * xf, 0) * float(1.0 / N_COLS)
+    out = xf / tl.sqrt(var + eps)
+    out = (w * out).to(x.dtype)
+
+    out_ptr = output + prog_id * out_row_stride
+    tl.store(out_ptr + offsets * out_col_stride, out, mask=offsets < N_COLS)
+
+
+def rmsnorm_forward_high_accuracy(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5, use_custom_tensor_mananger: bool = False):
+    """Rms norm."""
+
+    assert hidden_states.is_contiguous(), "hidden_states must be contiguous"
+
+    origin_shape = hidden_states.shape
+    hidden_dim = weight.shape[0]
+    assert hidden_dim == origin_shape[-1], f"hidden_dim {hidden_dim} != {origin_shape[-1]}"
+
+    rows = hidden_states.numel() // hidden_dim
+    if hidden_states.dim() == 3:  # (bs, seq_len, hidden_dim)
+        hidden_states = hidden_states.view(rows, hidden_dim)
+
+    in_row_stride, in_col_stride = hidden_states.stride(0), hidden_states.stride(1)
+
+    BLOCK_N = triton.next_power_of_2(hidden_dim)
+    if use_custom_tensor_mananger:
+        shape = hidden_states.shape
+        dtype = hidden_states.dtype
+        device = hidden_states.device
+        output = g_cache_manager.alloc_tensor(shape, dtype, device=device)
+    else:
+        output = torch.empty_like(hidden_states)
+
+    out_row_stride, out_col_stride = output.stride(0), output.stride(1)
+    grid = (rows,)
+    _rms_norm_high_accuracy_kernel[grid](
+        hidden_states,
+        weight,
+        output,
+        in_row_stride,
+        in_col_stride,
+        out_row_stride,
+        out_col_stride,
+        eps=eps,
+        N_COLS=hidden_dim,
+        BLOCK_N=BLOCK_N,
+        num_warps=4,
+        num_stages=3,
+    )
+    return output.reshape(origin_shape)
+
+
 def torch_rms_norm(x, weight, eps):
     return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight
 
@@ -88,11 +168,20 @@ def test_rms_norm(M, N, dtype, eps=1e-5, device="cuda"):
     weight = torch.rand(w_shape, dtype=dtype, device="cuda")
     x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device="cuda")
     # forward pass
-    y_tri = rmsnorm_forward(x, weight, eps)
+    y_tri = rmsnorm_forward_low_accuracy(x, weight, eps)
+    y_tri_high_acc = rmsnorm_forward_high_accuracy(x, weight, eps)
     y_ref = torch_rms_norm(x.to(torch.float32), weight.to(torch.float32), eps).to(dtype)
 
     # compare
-    print("type:", y_tri.dtype, y_ref.dtype)
+    print("type:", y_tri.dtype, y_ref.dtype, y_tri_high_acc.dtype)
     print("max delta:", torch.max(torch.abs(y_tri - y_ref)))
+    print("max delta:", torch.max(torch.abs(y_tri_high_acc - y_ref)))
     assert torch.allclose(y_tri, y_ref, atol=1e-2, rtol=0)
     return
+
+use_high_acc = os.getenv("RMSNORM_HIGH_ACCURACY", "False").upper() in ["ON", "TRUE", "1"]
+
+if use_high_acc:
+    rmsnorm_forward = rmsnorm_forward_high_accuracy
+else:
+    rmsnorm_forward = rmsnorm_forward_low_accuracy
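A short, hedged usage sketch of the names this file now exports; it assumes a CUDA device with Triton installed, and the shapes and dtypes are arbitrary examples rather than values from the commit. Because rmsnorm_forward is bound to one of the two implementations at import time, the RMSNORM_HIGH_ACCURACY environment variable (the code accepts ON, TRUE, or 1) has to be set before the module is imported.

# Compare both variants against the float32 reference defined in this file
# (torch_rms_norm). Requires CUDA + Triton; sizes and dtypes are illustrative.
import torch
from lightllm.models.llama.triton_kernel.rmsnorm import (
    rmsnorm_forward_low_accuracy,
    rmsnorm_forward_high_accuracy,
    torch_rms_norm,
)

x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")
w = torch.rand(4096, dtype=torch.float16, device="cuda")
eps = 1e-5

y_ref = torch_rms_norm(x.float(), w.float(), eps).half()
y_low = rmsnorm_forward_low_accuracy(x, w, eps)
y_high = rmsnorm_forward_high_accuracy(x, w, eps)

print("low-accuracy  max |delta|:", (y_low - y_ref).abs().max().item())
print("high-accuracy max |delta|:", (y_high - y_ref).abs().max().item())

In the added kernel, the input is cast to float32 before the mean of squares and the square root are computed, which is what the "high accuracy" name refers to.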

lightllm/models/qwen3/layer_infer/transformer_layer_infer.py
Lines changed: 3 additions & 2 deletions

@@ -36,17 +36,18 @@ def _get_qkv(
             input, out=cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_)
         ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
 
-        rmsnorm_forward(
+        q = rmsnorm_forward(
             q.view(-1, self.head_dim_),
             weight=layer_weight.q_norm_weight_.weight,
             eps=self.eps_,
-            out=q.view(-1, self.head_dim_),
+            use_custom_tensor_mananger=True
         )
 
         cache_kv[:, : self.tp_k_head_num_, :] = rmsnorm_forward(
             cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
             weight=layer_weight.k_norm_weight_.weight,
             eps=self.eps_,
+            use_custom_tensor_mananger=True
         ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
 
         rotary_emb_fwd(

lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py
Lines changed: 3 additions & 2 deletions

@@ -60,17 +60,18 @@ def _get_qkv(
         cache_kv = layer_weight.kv_proj.mm(
             input, out=cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_)
         ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
-        rmsnorm_forward(
+        q = rmsnorm_forward(
            q.view(-1, self.head_dim_),
            weight=layer_weight.q_norm_weight_.weight,
            eps=self.eps_,
-           out=q.view(-1, self.head_dim_),
+           use_custom_tensor_mananger=True
        )
 
        cache_kv[:, : self.tp_k_head_num_, :] = rmsnorm_forward(
            cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
            weight=layer_weight.k_norm_weight_.weight,
            eps=self.eps_,
+           use_custom_tensor_mananger=True
        ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
 
        rotary_emb_fwd(

lightllm/models/vit/layer_infer/transformer_layer_infer.py
Lines changed: 1 addition & 2 deletions

@@ -7,11 +7,10 @@
 import triton
 
 from lightllm.models.vit.layer_weights.transformer_layer_weight import ViTTransformerLayerWeight
-from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward, torch_rms_norm
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.utils.dist_utils import get_current_rank_in_dp, get_dp_world_size
 from lightllm.models.vit.triton_kernel.gelu_vit import gelu_fwd
-from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm
+from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward_high_accuracy as rms_norm
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
