
Commit ace5f16

bugfix: fix layernorm & rmsnorm f16 overflow (#335)
* [Test] Add f16 overflow testcase in layernorm and rmsnorm
* [Fix] call correct `rms_norm_f16x8_f32` and fix epsilon position
1 parent 92feb71
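
The commit addresses two distinct bugs. First, every f16 variant of the LayerNorm and RMSNorm kernels computed `hrsqrt(variance / (K_ + epsilon))`, adding epsilon to the element count `K_` rather than to the mean square: with `K_` near 1024 and `epsilon` near 1e-5, the epsilon term is numerically invisible, and the reciprocal square root is left unguarded against a zero variance. Second, the launch macro for `rms_norm_f16x8_f32` dispatched the f16-accumulating kernel, so the f32 path was never actually executed. A minimal standalone sketch of the epsilon change (the `inv_rms_*` helpers are hypothetical, not code from this commit):

```cuda
// Hypothetical helpers contrasting the two epsilon placements.
__device__ __forceinline__ float inv_rms_old(float sum_sq, float K,
                                             float epsilon) {
  // bug: epsilon ~ 1e-5 vanishes next to K ~ 1024, and rsqrtf(0.0f) == inf
  return rsqrtf(sum_sq / (K + epsilon));
}

__device__ __forceinline__ float inv_rms_new(float sum_sq, float K,
                                             float epsilon) {
  // fix: epsilon floors the mean square, matching the textbook definition
  return rsqrtf(sum_sq / K + epsilon);
}
```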

File tree: 4 files changed (+37, -12 lines)

kernels/layer-norm/layer_norm.cu (4 additions, 4 deletions)

```diff
@@ -197,7 +197,7 @@ __global__ void layer_norm_f16_f16_kernel(half *x, half *y, float g, float b,
   half variance = (value - s_mean) * (value - s_mean);
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   if (idx < N * K) {
@@ -232,7 +232,7 @@ __global__ void layer_norm_f16x2_f16_kernel(half *x, half *y, float g, float b,
   half variance = reg_x_hat.x * reg_x_hat.x + reg_x_hat.y * reg_x_hat.y;
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   if (idx < N * K) {
@@ -300,7 +300,7 @@ __global__ void layer_norm_f16x8_f16_kernel(half *x, half *y, float g, float b,

   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   // manual unroll
@@ -390,7 +390,7 @@ __global__ void layer_norm_f16x8_pack_f16_kernel(half *x, half *y, float g,
   }
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();

```
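
For reference, the formula the fixed kernels implement, as a hedged single-thread f32 sketch (`layer_norm_row_ref` is hypothetical, not a kernel from this repo): each row becomes `(x - mean) * rsqrt(var / K + epsilon) * g + b`, with epsilon applied after the division.

```cuda
#include <cuda_fp16.h>

// Hypothetical single-thread reference for one row; the real kernels
// block-reduce mean and variance across NUM_THREADS instead of looping.
__global__ void layer_norm_row_ref(const half *x, half *y, float g, float b,
                                   int K, float epsilon) {
  float mean = 0.0f, var = 0.0f;
  for (int i = 0; i < K; ++i)
    mean += __half2float(x[i]);
  mean /= K;
  for (int i = 0; i < K; ++i) {
    float d = __half2float(x[i]) - mean;
    var += d * d; // sum of squared deviations, accumulated in f32
  }
  float s = rsqrtf(var / K + epsilon); // epsilon outside the division
  for (int i = 0; i < K; ++i)
    y[i] = __float2half((__half2float(x[i]) - mean) * s * g + b);
}
```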

kernels/layer-norm/layer_norm.py (12 additions, 0 deletions)

```diff
@@ -96,6 +96,18 @@ def run_benchmark(
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)

+print(" " * 40 + f"f16 overflow without f32")
+print("-" * 85)
+x_f16 = x.half() * 100  # this will cause overflow for kernels without `f32`
+run_benchmark(lib.layer_norm_f16_f16, x_f16, "f16f16", out_f16)
+run_benchmark(lib.layer_norm_f16_f32, x_f16, "f16f32", out_f16)
+run_benchmark(lib.layer_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
+run_benchmark(naive_layer_norm, x_f16, "f16_th")
+print("-" * 85)
+
 print("-" * 85)
 N, K = 4096, 1024
 print(" " * 40 + f"N={N}, K={K}")
```

kernels/rms-norm/rms_norm.cu (8 additions, 8 deletions)

```diff
@@ -172,7 +172,7 @@ __global__ void rms_norm_f16_f16_kernel(half *x, half *y, float g, int N,
   half variance = value * value;
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   if (idx < N * K)
@@ -195,7 +195,7 @@ __global__ void rms_norm_f16x2_f16_kernel(half *x, half *y, float g, int N,
                       : __float2half(0.0f);
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   half2 reg_y;
@@ -241,7 +241,7 @@ __global__ void rms_norm_f16x8_f16_kernel(half *x, half *y, float g, int N,
   variance += HALF2_VARIANCE(reg_x_3, 6);
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   // manual unroll
@@ -292,7 +292,7 @@ __global__ void rms_norm_f16x8_f32_kernel(half *x, half *y, float g, int N,

   variance = block_reduce_sum_f32<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = rsqrtf(variance / ((float)K + epsilon));
+    s_variance = rsqrtf(variance / (float)K + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   // manual unroll
@@ -328,7 +328,7 @@ __global__ void rms_norm_f16_f32_kernel(half *x, half *y, float g, int N,
   float variance = value * value;
   variance = block_reduce_sum_f32<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = rsqrtf(variance / ((float)K + epsilon));
+    s_variance = rsqrtf(variance / (float)K + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();
   if (idx < N * K) {
@@ -360,7 +360,7 @@ __global__ void rms_norm_f16x8_pack_f16_kernel(half *x, half *y, float g, int N,
   }
   variance = block_reduce_sum_f16_f16<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = hrsqrt(variance / (K_ + epsilon));
+    s_variance = hrsqrt(variance / K_ + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();

@@ -396,7 +396,7 @@ __global__ void rms_norm_f16x8_pack_f32_kernel(half *x, half *y, float g, int N,
   }
   variance = block_reduce_sum_f32<NUM_THREADS>(variance);
   if (tid == 0)
-    s_variance = rsqrtf(variance / ((float)K + epsilon));
+    s_variance = rsqrtf(variance / (float)K + epsilon);
   // wait for s_variance in shared memory to be ready for all threads
   __syncthreads();

@@ -626,7 +626,7 @@ void rms_norm_f32x4(torch::Tensor x, torch::Tensor y, float g) {
 }

 #define LANUCH_RMS_NORM_F16x8F32_KERNEL(K)                                     \
-  rms_norm_f16x8_f16_kernel<(K) / 8>                                           \
+  rms_norm_f16x8_f32_kernel<(K) / 8>                                           \
       <<<grid, block>>>(reinterpret_cast<half *>(x.data_ptr()),                \
                         reinterpret_cast<half *>(y.data_ptr()), g, N, (K));

```
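
The last hunk is the dispatch half of the fix: the `LANUCH_RMS_NORM_F16x8F32_KERNEL` macro (the misspelling is in the source) was instantiating `rms_norm_f16x8_f16_kernel`, so the f16x8_f32 entry point silently ran the overflow-prone f16 path. What the correctly dispatched kernel computes per row, as a hedged single-thread sketch (`rms_norm_row_ref` is hypothetical, not from this repo):

```cuda
#include <cuda_fp16.h>

// Hypothetical reference: y = x * rsqrt(mean(x^2) + epsilon) * g for one row.
// The real kernel uses x8 vectorized loads and a block-wide f32 reduction.
__global__ void rms_norm_row_ref(const half *x, half *y, float g, int K,
                                 float epsilon) {
  float sum_sq = 0.0f;
  for (int i = 0; i < K; ++i) {
    float v = __half2float(x[i]);
    sum_sq += v * v; // f32 accumulation keeps |x| ~ 100 inputs finite
  }
  float s = rsqrtf(sum_sq / K + epsilon); // epsilon outside the division
  for (int i = 0; i < K; ++i)
    y[i] = __float2half(__half2float(x[i]) * s * g);
}
```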

kernels/rms-norm/rms_norm.py (13 additions, 0 deletions)

```diff
@@ -95,6 +95,19 @@ def run_benchmark(
 run_benchmark(naive_rms_norm, x_f16, "f16_th")
 print("-" * 85)

+print(" " * 40 + f"f16 overflow without f32")
+print("-" * 85)
+x_f16 = x.half() * 100  # this will cause overflow for kernels without `f32`
+run_benchmark(lib.rms_norm_f16_f16, x_f16, "f16f16", out_f16)
+run_benchmark(lib.rms_norm_f16_f32, x_f16, "f16f32", out_f16)
+run_benchmark(lib.rms_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
+run_benchmark(lib.rms_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
+run_benchmark(lib.rms_norm_f16x8_f32, x_f16, "f16x8f32", out_f16)
+run_benchmark(lib.rms_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.rms_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
+run_benchmark(naive_rms_norm, x_f16, "f16_th")
+print("-" * 85)
+
 print("-" * 85)
 N, K = 4096, 1024
 print(" " * 40 + f"N={N}, K={K}")
```
