Commit b148bb4

Fix Gemma3 head norm (RMS)

1 parent d471d44

11 files changed: +129 −59 lines

exllamav2/architecture.py

Lines changed: 2 additions & 0 deletions
@@ -209,6 +209,7 @@ class Params:
 
     # Layer norm type
     norm = "rmsnorm"
+    headnorm = "layernorm"
 
     # RoPE style
     rope_style = RopeStyle.NEOX

@@ -520,6 +521,7 @@ class Params:
         self.lm.default_sliding_window_pattern = 6
         self.lm.default_rope_theta = 1e6
         self.lm.pos_id_index = 1
+        self.lm.headnorm = "rmsnorm"
 
         self.vt_prefix = "vision_tower.vision_model."
         self.vt.keys.update({
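
Note: the two hunks above introduce an architecture-level switch, headnorm, which defaults to "layernorm" and is overridden to "rmsnorm" for Gemma3. A minimal sketch of how such a selector flows downstream (illustrative Python only; the class and attribute names are simplified stand-ins, not the actual exllamav2 classes):

# Illustrative sketch only; simplified stand-ins, not the actual exllamav2 classes.

class LMParams:
    norm = "rmsnorm"          # existing layer-norm selector
    headnorm = "layernorm"    # new: norm type for the per-head Q/K norm

class Gemma3LMParams(LMParams):
    headnorm = "rmsnorm"      # Gemma3 override, as in the second hunk above

def head_norm_is_rms(params: LMParams) -> bool:
    # The attention layer only needs a boolean, mirroring the attn.py change below
    return params.headnorm == "rmsnorm"

assert head_norm_is_rms(Gemma3LMParams())
assert not head_norm_is_rms(LMParams())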

exllamav2/attn.py

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ def load(self, device_context: bool = True):
                 norm_weight,
                 norm_bias,
                 is_rms,
+                self.archparams.headnorm == "rmsnorm",
                 eps,
                 self.q_proj.q_handle,
                 self.k_proj.q_handle,

exllamav2/exllamav2_ext/cuda/head_norm.cu

Lines changed: 89 additions & 43 deletions
@@ -2,7 +2,7 @@
 #include "util.cuh"
 #include "compat.cuh"
 
-#define MAX_HEAD_DIM 128
+#define MAX_HEAD_DIM 256
 #define WARP_SIZE 32
 #define MAX_WARPS (MAX_HEAD_DIM / WARP_SIZE)
 

@@ -16,7 +16,8 @@ __global__ void head_norm_kernel
     const float r_dim,
     const int rows,
     const int num_heads,
-    const int head_dim
+    const int head_dim,
+    const bool rms
 )
 {
     int warp_id = threadIdx.x / WARP_SIZE;

@@ -37,62 +38,106 @@ __global__ void head_norm_kernel
     float itemf[2];
     float sum = 0.0f;
 
-    half2 h01 = ((half2*)x_ptr)[t];
-    float f0 = __half2float(__low2half(h01));
-    float f1 = __half2float(__high2half(h01));
-    f0 = fmaxf(-65504.0f, fminf(f0, 65504.0f));
-    f1 = fmaxf(-65504.0f, fminf(f1, 65504.0f));
-    itemf[0] = f0;
-    itemf[1] = f1;
-    sum += f0;
-    sum += f1;
+    // RMS Norm
+
+    if (rms)
+    {
+        half2 h01 = ((half2*)x_ptr)[t];
+        float f0 = __half2float(__low2half(h01));
+        float f1 = __half2float(__high2half(h01));
+        f0 = fmaxf(-65504.0f, fminf(f0, 65504.0f));
+        f1 = fmaxf(-65504.0f, fminf(f1, 65504.0f));
+        itemf[0] = f0;
+        itemf[1] = f1;
+        sum = fma(f0, f0, sum);
+        sum = fma(f1, f1, sum);
+
+        // Shuffle to sum across lanes
+
+        for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
+        if (lane_id == 0) sums[warp_id] = sum;
+        __syncthreads();
+
+        // Sum of partial sums
+
+        sum = 0.0f;
+        for(int i = 0; i < num_warps; ++i) sum += sums[i];
+
+        // Get 1/sqrt(variance)
+
+        float rsvar = rsqrtf(sum * r_dim + epsilon);
+
+        // Normalize x, scaling by w
+
+        half2 w01 = w_ptr2[t];
+        float n0 = itemf[0] * __half2float(__low2half(w01)) * rsvar;
+        float n1 = itemf[1] * __half2float(__high2half(w01)) * rsvar;
+        half2 nh = __halves2half2(__float2half_rn(n0), __float2half_rn(n1));
+        if (b) nh = __hadd2(nh, b_ptr2[t]); // Optional bias
+        y_ptr2[t] = nh;
+    }
+
+    // LayerNorm
+
+    else
+    {
+        half2 h01 = ((half2*)x_ptr)[t];
+        float f0 = __half2float(__low2half(h01));
+        float f1 = __half2float(__high2half(h01));
+        f0 = fmaxf(-65504.0f, fminf(f0, 65504.0f));
+        f1 = fmaxf(-65504.0f, fminf(f1, 65504.0f));
+        itemf[0] = f0;
+        itemf[1] = f1;
+        sum += f0;
+        sum += f1;
 
-    // Shuffle to sum across lanes
+        // Shuffle to sum across lanes
 
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-    if (lane_id == 0) sums[warp_id] = sum;
-    __syncthreads();
+        for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
+        if (lane_id == 0) sums[warp_id] = sum;
+        __syncthreads();
 
-    // Sum of partial sums
+        // Sum of partial sums
 
-    sum = 0.0f;
-    for(int i = 0; i < num_warps; ++i) sum += sums[i];
+        sum = 0.0f;
+        for(int i = 0; i < num_warps; ++i) sum += sums[i];
 
-    // Compute mean
+        // Compute mean
 
-    float mean = sum * r_dim;
+        float mean = sum * r_dim;
 
-    // Compute square of distance to mean
+        // Compute square of distance to mean
 
-    sum = 0.0f;
-    itemf[0] -= mean;
-    itemf[1] -= mean;
-    sum = fma(itemf[0], itemf[0], sum);
-    sum = fma(itemf[1], itemf[1], sum);
+        sum = 0.0f;
+        itemf[0] -= mean;
+        itemf[1] -= mean;
+        sum = fma(itemf[0], itemf[0], sum);
+        sum = fma(itemf[1], itemf[1], sum);
 
-    // Shuffle to sum across lanes
+        // Shuffle to sum across lanes
 
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-    if (lane_id == 0) sums[warp_id] = sum;
-    __syncthreads();
+        for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
+        if (lane_id == 0) sums[warp_id] = sum;
+        __syncthreads();
 
-    // Sum of partial sums
+        // Sum of partial sums
 
-    sum = 0.0f;
-    for(int i = 0; i < num_warps; ++i) sum += sums[i];
+        sum = 0.0f;
+        for(int i = 0; i < num_warps; ++i) sum += sums[i];
 
-    // Get 1/sqrt(variance)
+        // Get 1/sqrt(variance)
 
-    float rsvar = rsqrtf(sum * r_dim + epsilon);
+        float rsvar = rsqrtf(sum * r_dim + epsilon);
 
-    // Normalize x, scaling by w
+        // Normalize x, scaling by w
 
-    half2 w01 = w_ptr2[t];
-    float n0 = itemf[0] * __half2float(__low2half(w01)) * rsvar;
-    float n1 = itemf[1] * __half2float(__high2half(w01)) * rsvar;
-    half2 nh = __halves2half2(__float2half_rn(n0), __float2half_rn(n1));
-    if (b) nh = __hadd2(nh, b_ptr2[t]); // Optional bias
-    y_ptr2[t] = nh;
+        half2 w01 = w_ptr2[t];
+        float n0 = itemf[0] * __half2float(__low2half(w01)) * rsvar;
+        float n1 = itemf[1] * __half2float(__high2half(w01)) * rsvar;
+        half2 nh = __halves2half2(__float2half_rn(n0), __float2half_rn(n1));
+        if (b) nh = __hadd2(nh, b_ptr2[t]); // Optional bias
+        y_ptr2[t] = nh;
+    }
 }
 
 void head_norm_cuda

@@ -103,6 +148,7 @@ void head_norm_cuda
     const half* b,
     half* y,
    const float epsilon,
+    bool rms,
     const int rows,
     const int num_heads,
     const int head_dim,

@@ -117,7 +163,7 @@ void head_norm_cuda
 
     float r_dim = 1.0f / (float) head_dim;
 
-    head_norm_kernel<<<gridDim, blockDim, 0, stream>>>(x, w, b, y, epsilon, r_dim, rows, num_heads, head_dim);
+    head_norm_kernel<<<gridDim, blockDim, 0, stream>>>(x, w, b, y, epsilon, r_dim, rows, num_heads, head_dim, rms);
     if (graph) graph->attach_label(stream, label, 0);
 }
 
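
For reference, the kernel now normalizes each head's head_dim-wide slice with either RMS norm (new branch) or the original LayerNorm, clamping inputs to the fp16 range and adding epsilon inside the rsqrt. Below is a hedged PyTorch restatement of the two branches, useful for sanity-checking kernel output; the tensor shapes and names are assumptions, not the extension's API:

import torch

def head_norm_ref(x, w, b=None, eps=1e-6, rms=True):
    # x: (rows, num_heads, head_dim) in float16; normalization is over head_dim only.
    xf = x.float().clamp(-65504.0, 65504.0)          # kernel clamps to the fp16 range
    if rms:
        # RMS branch: 1/sqrt(mean(x^2) + eps), then scale by w
        rsvar = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
        y = xf * w.float() * rsvar
    else:
        # LayerNorm branch: subtract the mean, 1/sqrt(var + eps), then scale by w
        mean = xf.mean(dim=-1, keepdim=True)
        var = (xf - mean).pow(2).mean(dim=-1, keepdim=True)
        y = (xf - mean) * w.float() * torch.rsqrt(var + eps)
    if b is not None:
        y = y + b.float()                              # optional bias, added last
    return y.half()

# Example: 4 rows of 8 heads, 256 elements per head, unit weight
q = torch.randn(4, 8, 256, dtype=torch.float16)
w = torch.ones(256, dtype=torch.float16)
out = head_norm_ref(q, w, rms=True)

The MAX_HEAD_DIM bump from 128 to 256 simply widens the largest head size the kernel accepts.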

exllamav2/exllamav2_ext/cuda/head_norm.cuh

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ void head_norm_cuda
     const half* b,
     half* y,
     const float epsilon,
+    bool rms,
     const int rows,
     const int num_heads,
     const int head_dim,

exllamav2/exllamav2_ext/cuda/q_attn.cu

Lines changed: 4 additions & 2 deletions
@@ -86,6 +86,7 @@ QAttn::QAttn
     half* _layernorm,
     half* _layernorm_bias,
     bool _layernorm_is_rms,
+    bool _headnorm_is_rms,
     float _norm_epsilon,
     QMatrix* _q_proj,
     QMatrix* _k_proj,

@@ -115,6 +116,7 @@ QAttn::QAttn
     layernorm(_layernorm),
     layernorm_bias(_layernorm_bias),
     layernorm_is_rms(_layernorm_is_rms),
+    headnorm_is_rms(_headnorm_is_rms),
     norm_epsilon(_norm_epsilon),
     q_proj(_q_proj),
     k_proj(_k_proj),

@@ -281,10 +283,10 @@ void QAttn::forward_cuda_1_run
     apply_loras_cuda(stream, cublas_handle, v_proj_lora, loras, v_proj, norm_state, temp_v, lora_temp, q_len * batch_size);
 
     if (q_norm)
-        head_norm_cuda(stream, temp_q, q_norm, NULL, temp_q, norm_epsilon, q_len * batch_size, num_heads, head_dim, graph, KernelLabels::Q_NORM);
+        head_norm_cuda(stream, temp_q, q_norm, NULL, temp_q, norm_epsilon, headnorm_is_rms, q_len * batch_size, num_heads, head_dim, graph, KernelLabels::Q_NORM);
 
     if (k_norm)
-        head_norm_cuda(stream, temp_k, k_norm, NULL, temp_k, norm_epsilon, q_len * batch_size, num_kv_heads, head_dim, graph, KernelLabels::K_NORM);
+        head_norm_cuda(stream, temp_k, k_norm, NULL, temp_k, norm_epsilon, headnorm_is_rms, q_len * batch_size, num_kv_heads, head_dim, graph, KernelLabels::K_NORM);
 
     // rope_cuda(stream, temp_q, sin, cos, batch_size, q_len * num_heads, head_dim, num_heads, past_len, past_lens);
     // rope_cuda(stream, temp_k, sin, cos, batch_size, q_len * num_kv_heads, head_dim, num_kv_heads, past_len, past_lens);
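
In forward_cuda_1_run the per-head Q/K norms, when present, are applied to the projected Q and K before rotary embeddings, and both calls now pass headnorm_is_rms through to head_norm_cuda. A hedged sketch of that ordering in plain PyTorch; the function and tensor names here are placeholders, not the QAttn API:

import torch
import torch.nn.functional as F

def rms_head_norm(x, w, eps=1e-6):
    # Normalize each (head_dim,) slice independently, as head_norm_cuda does
    rsvar = torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + eps)
    return (x.float() * w.float() * rsvar).to(x.dtype)

def norm_then_rope(q, k, q_norm_w, k_norm_w, apply_rope, rms=True, eps=1e-6):
    # q: (rows, num_heads, head_dim), k: (rows, num_kv_heads, head_dim)
    if q_norm_w is not None:
        q = rms_head_norm(q, q_norm_w, eps) if rms else F.layer_norm(q, q.shape[-1:], q_norm_w, eps=eps)
    if k_norm_w is not None:
        k = rms_head_norm(k, k_norm_w, eps) if rms else F.layer_norm(k, k.shape[-1:], k_norm_w, eps=eps)
    # Rotary embeddings are applied only after the per-head norm
    return apply_rope(q), apply_rope(k)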

exllamav2/exllamav2_ext/cuda/q_attn.cuh

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@ public:
     half* post_layernorm;
     half* post_layernorm_bias;
     bool layernorm_is_rms;
+    bool headnorm_is_rms;
     float norm_epsilon;
 
     half* q_norm;

@@ -86,6 +87,7 @@ public:
     half* _layernorm,
     half* _layernorm_bias,
     bool _layernorm_is_rms,
+    bool _headnorm_is_rms,
     float _norm_epsilon,
     QMatrix* _q_proj,
     QMatrix* _k_proj,

exllamav2/exllamav2_ext/ext_norm.cpp

Lines changed: 6 additions & 3 deletions
@@ -162,7 +162,8 @@ void head_norm
     torch::Tensor w,
     torch::Tensor b,
     torch::Tensor y,
-    float epsilon
+    float epsilon,
+    bool rms
 )
 {
     TORCH_CHECK_DTYPE(x, kHalf);

@@ -191,6 +192,7 @@ void head_norm
         b.device().is_meta() ? NULL : (half*) b.data_ptr(),
         (half*) y.data_ptr(),
         epsilon,
+        rms,
         rows,
         num_heads,
         head_dim

@@ -202,8 +204,9 @@ void head_norm_
     torch::Tensor x,
     torch::Tensor w,
     torch::Tensor b,
-    float epsilon
+    float epsilon,
+    bool rms
 )
 {
-    head_norm(x, w, b, x, epsilon);
+    head_norm(x, w, b, x, epsilon, rms);
 }
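
With this change both extension entry points take a trailing rms flag: head_norm(x, w, b, y, epsilon, rms) and the in-place head_norm_(x, w, b, epsilon, rms). A hedged usage sketch from Python; the import path and the expected layout of x are assumptions, since neither the pybind registration nor the shape checks appear in this diff:

# Assumption: exllamav2 exposes the compiled module as exllamav2.ext.exllamav2_ext,
# and the pybind names match the C++ functions above.

import torch
from exllamav2.ext import exllamav2_ext as ext_c

rows, num_heads, head_dim = 4, 8, 256
x = torch.randn(rows, num_heads, head_dim, dtype=torch.half, device="cuda")
w = torch.ones(head_dim, dtype=torch.half, device="cuda")
b = torch.empty(head_dim, dtype=torch.half, device="meta")  # meta tensor -> NULL bias in the wrapper
y = torch.empty_like(x)

ext_c.head_norm(x, w, b, y, 1e-6, True)    # out-of-place, RMS head norm
ext_c.head_norm_(x, w, b, 1e-6, False)     # in-place, LayerNorm head norm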

exllamav2/exllamav2_ext/ext_norm.h

Lines changed: 4 additions & 2 deletions
@@ -46,15 +46,17 @@ void head_norm
     torch::Tensor w,
     torch::Tensor b,
     torch::Tensor y,
-    float epsilon
+    float epsilon,
+    bool rms
 );
 
 void head_norm_
 (
     torch::Tensor x,
     torch::Tensor w,
     torch::Tensor b,
-    float epsilon
+    float epsilon,
+    bool rms
 );
 

exllamav2/exllamav2_ext/ext_qattn.cpp

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@ uintptr_t make_q_attn
     torch::Tensor layernorm,
     torch::Tensor layernorm_bias,
     bool layernorm_is_rms,
+    bool headnorm_is_rms,
     float norm_epsilon,
     uintptr_t q_q_proj,
     uintptr_t q_k_proj,

@@ -71,6 +72,7 @@ uintptr_t make_q_attn
         layernorm.is_meta() ? NULL : (half*) layernorm.data_ptr(),
         layernorm_bias.is_meta() ? NULL : (half*) layernorm_bias.data_ptr(),
         layernorm_is_rms,
+        headnorm_is_rms,
         norm_epsilon,
         qm_q_proj,
         qm_k_proj,

exllamav2/exllamav2_ext/ext_qattn.h

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ uintptr_t make_q_attn
     torch::Tensor layernorm,
     torch::Tensor layernorm_bias,
     bool layernorm_is_rms,
+    bool headnorm_is_rms,
     float norm_epsilon,
     uintptr_t q_q_proj,
     uintptr_t q_k_proj,
