
Commit 93636df

[LayerNorm][FP16] support fp16x8_pack_f32 kernel (#48)
* Update README.md
* Update layer_norm.cu
* Update layer_norm.py
* Update README.md
* Update README.md
* Update layer_norm.py
1 parent 54c761d commit 93636df

File tree

4 files changed (+164 lines, -54 lines)


README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -81,6 +81,7 @@
 | ✔️ [layer_norm_f16x2_f16(per token)](./layer-norm/layer_norm.cu)|f16|f16|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [layer_norm_f16x8_f16(per token)](./layer-norm/layer_norm.cu)|f16|f16|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [layer_norm_f16x8_pack_f16(per token)](./layer-norm/layer_norm.cu)|f16|f16|[link](./layer-norm/)|⭐️⭐️|
+| ✔️ [layer_norm_f16x8_pack_f32(per token)](./layer-norm/layer_norm.cu)|f16|f32|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [layer_norm_f16_f32(per token)](./layer-norm/layer_norm.cu)|f16|f32|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [rms_norm_f32(per token)](./rms-norm/rms_norm.cu)|f32|f32|[link](./rms-norm/)|⭐️⭐️|
 | ✔️ [rms_norm_f32x4(per token)](./rms-norm/rms_norm.cu)|f32|f32|[link](./rms-norm/)|⭐️⭐️|
```

layer-norm/README.md

Lines changed: 42 additions & 35 deletions
````diff
@@ -10,6 +10,7 @@
 - [X] layer_norm_f16x2_f16_kernel
 - [X] layer_norm_f16x8_f16_kernel
 - [X] layer_norm_f16x8_pack_f16_kernel
+- [X] layer_norm_f16x8_pack_f32_kernel
 - [X] layer_norm_f16_f32_kernel
 - [X] PyTorch bindings
 
@@ -27,64 +28,70 @@ python3 layer_norm.py
 -------------------------------------------------------------------------------------
 N=4096, K=512
 -------------------------------------------------------------------------------------
-out_f32: ['-1.76292217 ', '0.04765211 ', '0.50859255 '], time:0.01897240ms
-out_f32x4: ['-1.76292217 ', '0.04765211 ', '0.50859255 '], time:0.00600266ms
-out_f32_th: ['-1.76119995 ', '0.04760556 ', '0.50809568 '], time:0.07085347ms
+out_f32: ['-0.95119929 ', '0.65728813 ', '-0.27701864 '], time:0.01898599ms
+out_f32x4: ['-0.95119929 ', '0.65728813 ', '-0.27701864 '], time:0.00600958ms
+out_f32_th: ['-0.95026982 ', '0.65664589 ', '-0.27674797 '], time:0.07345414ms
 -------------------------------------------------------------------------------------
-out_f16f16: ['-1.76367188 ', '0.04763794 ', '0.50878906 '], time:0.01869035ms
-out_f16f32: ['-1.76367188 ', '0.04766846 ', '0.50878906 '], time:0.01897883ms
-out_f16x2f16: ['-1.76367188 ', '0.04766846 ', '0.50878906 '], time:0.00951219ms
-out_f16x8f16: ['-1.76367188 ', '0.04766846 ', '0.50878906 '], time:0.00467825ms
-out_f16x8packf16: ['-1.76367188 ', '0.04763794 ', '0.50878906 '], time:0.00430202ms
-out_f16_th: ['-1.76171875 ', '0.04760742 ', '0.50830078 '], time:0.07009959ms
+out_f16f16: ['-0.95068359 ', '0.65722656 ', '-0.27709961 '], time:0.01866651ms
+out_f16f32: ['-0.95117188 ', '0.65722656 ', '-0.27709961 '], time:0.01897073ms
+out_f16x2f16: ['-0.95068359 ', '0.65722656 ', '-0.27709961 '], time:0.00952697ms
+out_f16x8f16: ['-0.95068359 ', '0.65722656 ', '-0.27709961 '], time:0.00470805ms
+out_f16x8packf16: ['-0.95117188 ', '0.65673828 ', '-0.27709961 '], time:0.00427437ms
+out_f16x8packf32: ['-0.95117188 ', '0.65722656 ', '-0.27709961 '], time:0.00418639ms
+out_f16_th: ['-0.94970703 ', '0.65673828 ', '-0.27685547 '], time:0.07291913ms
 -------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------
 N=4096, K=1024
 -------------------------------------------------------------------------------------
-out_f32: ['-0.65619785 ', '1.33576787 ', '-0.29172164 '], time:0.05123448ms
-out_f32x4: ['-0.65619785 ', '1.33576787 ', '-0.29172164 '], time:0.01073551ms
-out_f32_th: ['-0.65587735 ', '1.33511555 ', '-0.29157916 '], time:0.07034254ms
+out_f32: ['0.81839228 ', '0.36616057 ', '-1.71588480 '], time:0.05122757ms
+out_f32x4: ['0.81839228 ', '0.36616057 ', '-1.71588480 '], time:0.01071095ms
+out_f32_th: ['0.81799269 ', '0.36598179 ', '-1.71504688 '], time:0.07267237ms
 -------------------------------------------------------------------------------------
-out_f16f16: ['-0.65576172 ', '1.3359375 ', '-0.29174805 '], time:0.05320668ms
-out_f16f32: ['-0.65576172 ', '1.3359375 ', '-0.29150391 '], time:0.05061388ms
-out_f16x2f16: ['-0.65576172 ', '1.3359375 ', '-0.29174805 '], time:0.01861978ms
-out_f16x8f16: ['-0.65576172 ', '1.3359375 ', '-0.29174805 '], time:0.00745845ms
-out_f16x8packf16: ['-0.65576172 ', '1.3359375 ', '-0.29174805 '], time:0.00648832ms
-out_f16_th: ['-0.65527344 ', '1.33398438 ', '-0.29150391 '], time:0.07068610ms
+out_f16f16: ['0.81835938 ', '0.36596680 ', '-1.71484375 '], time:0.05317926ms
+out_f16f32: ['0.81835938 ', '0.36621094 ', '-1.71582031 '], time:0.05062103ms
+out_f16x2f16: ['0.81884766 ', '0.36621094 ', '-1.71679688 '], time:0.01855445ms
+out_f16x8f16: ['0.81884766 ', '0.36621094 ', '-1.71679688 '], time:0.00742888ms
+out_f16x8packf16: ['0.81884766 ', '0.36621094 ', '-1.71679688 '], time:0.00645399ms
+out_f16x8packf32: ['0.81835938 ', '0.36621094 ', '-1.71582031 '], time:0.00634456ms
+out_f16_th: ['0.81835938 ', '0.36596680 ', '-1.71582031 '], time:0.07386255ms
 -------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------
 N=4096, K=2048
 -------------------------------------------------------------------------------------
-out_f32x4: ['0.92044634 ', '0.37421227 ', '-2.49094558 '], time:0.02202415ms
-out_f32_th: ['0.92022169 ', '0.37412092 ', '-2.49033761 '], time:0.12026787ms
+out_f32x4: ['-0.65341073 ', '0.10270299 ', '-0.06597849 '], time:0.02200651ms
+out_f32_th: ['-0.65325129 ', '0.10267793 ', '-0.06596238 '], time:0.12027287ms
 -------------------------------------------------------------------------------------
-out_f16x2f16: ['0.92041016 ', '0.37426758 ', '-2.49023438 '], time:0.05346847ms
-out_f16x8f16: ['0.92041016 ', '0.37426758 ', '-2.49023438 '], time:0.01381087ms
-out_f16x8packf16: ['0.92041016 ', '0.37426758 ', '-2.49023438 '], time:0.01159072ms
-out_f16_th: ['0.92041016 ', '0.37426758 ', '-2.49023438 '], time:0.08454061ms
+out_f16x2f16: ['-0.65332031 ', '0.10266113 ', '-0.06591797 '], time:0.05352354ms
+out_f16x8f16: ['-0.65380859 ', '0.10272217 ', '-0.06597900 '], time:0.01377678ms
+out_f16x8packf16: ['-0.65332031 ', '0.10266113 ', '-0.06591797 '], time:0.01154637ms
+out_f16x8packf32: ['-0.65332031 ', '0.10272217 ', '-0.06597900 '], time:0.01166582ms
+out_f16_th: ['-0.65380859 ', '0.10272217 ', '-0.06597900 '], time:0.08442783ms
 -------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------
 N=4096, K=4096
 -------------------------------------------------------------------------------------
-out_f32x4: ['-2.05339074 ', '0.25924587 ', '0.42393678 '], time:0.18885875ms
-out_f32_th: ['-2.05314016 ', '0.25921422 ', '0.42388505 '], time:0.77834105ms
+out_f32x4: ['2.38733387 ', '-0.03023042 ', '0.66022825 '], time:0.18884635ms
+out_f32_th: ['2.38704205 ', '-0.03022672 ', '0.66014749 '], time:0.77852798ms
 -------------------------------------------------------------------------------------
-out_f16x8f16: ['-2.05273438 ', '0.2590332 ', '0.42382812 '], time:0.03327322ms
-out_f16x8packf16: ['-2.05273438 ', '0.2590332 ', '0.42382812 '], time:0.02402687ms
-out_f16_th: ['-2.05273438 ', '0.2590332 ', '0.42382812 '], time:0.17436218ms
+out_f16x8f16: ['2.38671875 ', '-0.03024292 ', '0.66015625 '], time:0.03325391ms
+out_f16x8packf16: ['2.38671875 ', '-0.03024292 ', '0.66015625 '], time:0.02401376ms
+out_f16x8packf32: ['2.38671875 ', '-0.03021240 ', '0.66064453 '], time:0.02381730ms
+out_f16_th: ['2.38671875 ', '-0.03021240 ', '0.66015625 '], time:0.17546010ms
 -------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------
 N=4096, K=8192
 -------------------------------------------------------------------------------------
-out_f16x8f16: ['-1.0234375 ', '-0.3371582 ', '-1.54882812 '], time:0.19311237ms
-out_f16x8packf16: ['-1.0234375 ', '-0.33691406 ', '-1.54882812 '], time:0.18668032ms
-out_f16_th: ['-1.0234375 ', '-0.33691406 ', '-1.54882812 '], time:0.84443021ms
+out_f16x8f16: ['0.15905762 ', '1.06542969 ', '-0.19396973 '], time:0.19306803ms
+out_f16x8packf16: ['0.15905762 ', '1.06542969 ', '-0.19396973 '], time:0.18665886ms
+out_f16x8packf32: ['0.15905762 ', '1.06542969 ', '-0.19396973 '], time:0.18657684ms
+out_f16_th: ['0.15905762 ', '1.06542969 ', '-0.19396973 '], time:0.84462571ms
 -------------------------------------------------------------------------------------
 -------------------------------------------------------------------------------------
 N=8192, K=8192
 -------------------------------------------------------------------------------------
-out_f16x8f16: ['-1.03320312 ', '0.41455078 ', '-0.49707031 '], time:0.38361049ms
-out_f16x8packf16: ['-1.03320312 ', '0.41455078 ', '-0.49707031 '], time:0.40809250ms
-out_f16_th: ['-1.03320312 ', '0.41455078 ', '-0.49707031 '], time:1.99517584ms
+out_f16x8f16: ['-0.53662109 ', '2.359375 ', '0.78027344 '], time:0.38366604ms
+out_f16x8packf16: ['-0.53662109 ', '2.359375 ', '0.78027344 '], time:0.40789628ms
+out_f16x8packf32: ['-0.53613281 ', '2.359375 ', '0.78027344 '], time:0.40818143ms
+out_f16_th: ['-0.53662109 ', '2.359375 ', '0.78027344 '], time:1.99523735ms
 -------------------------------------------------------------------------------------
 ```
````

layer-norm/layer_norm.cu

Lines changed: 115 additions & 19 deletions
```diff
@@ -376,6 +376,52 @@ __global__ void layer_norm_f16x8_pack_f16_kernel(half* x, half* y, float g, floa
   // TODO: support non 8-multiple K here
 }
 
+template<const int NUM_THREADS=256>
+__global__ void layer_norm_f16x8_pack_f32_kernel(half* x, half* y, float g, float b, int N, int K) {
+  int tid = threadIdx.x; // 0..K-1
+  int bid = blockIdx.x; // 0..N-1
+  int idx = (bid * blockDim.x + threadIdx.x) * 8;
+  const float epsilon = 1e-5f;
+
+  __shared__ float s_mean; // shared within block
+  __shared__ float s_variance; // shared within block
+  // temporary register(memory), .local space in ptx, addressable
+  half pack_x[8], pack_y[8]; // 8x16 bits=128 bits.
+  // reinterpret as float4 and load 128 bits in 1 memory issue.
+  LDST128BITS(pack_x[0]) = LDST128BITS(x[idx]); // load 128 bits
+
+  float value = 0.0f;
+  #pragma unroll
+  for (int i = 0; i < 8; ++i) {
+    value += ((idx + i) < N * K ? __half2float(pack_x[i]) : 0.0f);
+  }
+  float sum = block_reduce_sum_f32<NUM_THREADS>(value);
+  if (tid == 0) s_mean = sum / (float) K;
+  // wait for s_mean in shared memory to be ready for all threads
+  __syncthreads();
+
+  float variance = 0.0f;
+  #pragma unroll
+  for (int i = 0; i < 8; ++i) {
+    float v_hat = __half2float(pack_x[i]) - s_mean;
+    variance += ((idx + i) < N * K ? v_hat * v_hat : 0.0f);
+  }
+  variance = block_reduce_sum_f32<NUM_THREADS>(variance);
+  if (tid == 0) s_variance = rsqrtf(variance / ((float) K + epsilon));
+  // wait for s_variance in shared memory to be ready for all threads
+  __syncthreads();
+
+  #pragma unroll
+  for (int i = 0; i < 8; ++i) {
+    pack_y[i] = __float2half(
+      __fmaf_rn(((__half2float(pack_x[i]) - s_mean) * s_variance), g, b)
+    );
+  }
+  // reinterpret as float4 and store 128 bits in 1 memory issue.
+  if ((idx + 7) < N * K) { LDST128BITS(y[idx]) = LDST128BITS(pack_y[0]); }
+  // TODO: support non 8-multiple K here
+}
+
 // --------------------- PyTorch bindings for custom kernel -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
```
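The new kernel reuses two helpers defined earlier in layer_norm.cu that are not visible in this diff: the LDST128BITS macro (one 128-bit load/store per 8 half values) and block_reduce_sum_f32 (block-wide sum). Compared with layer_norm_f16x8_pack_f16_kernel, the substantive change is that each element is converted with __half2float before the mean/variance reductions, so accumulation runs in fp32 while loads and stores stay packed fp16. For readers following along without the full file, a minimal sketch of what the assumed helpers typically look like (names match the kernel above; the exact implementation in the repo may differ):

```cuda
#define WARP_SIZE 32
// Reinterpret 8 contiguous half values (128 bits) as one float4 so a single
// ld/st instruction moves the whole pack.
#define LDST128BITS(value) (reinterpret_cast<float4*>(&(value))[0])

// Butterfly (xor-shuffle) sum within one warp.
template<const int kWarpSize = WARP_SIZE>
__device__ __forceinline__ float warp_reduce_sum_f32(float val) {
  #pragma unroll
  for (int mask = kWarpSize >> 1; mask >= 1; mask >>= 1) {
    val += __shfl_xor_sync(0xffffffff, val, mask);
  }
  return val;
}

// Block-wide sum: reduce within each warp, stage per-warp partials in shared
// memory, then reduce the partials. Thread 0 (the only consumer in the kernel
// above) ends up holding the full block sum.
template<const int NUM_THREADS = 256>
__device__ float block_reduce_sum_f32(float val) {
  constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;
  const int warp = threadIdx.x / WARP_SIZE;
  const int lane = threadIdx.x % WARP_SIZE;
  static __shared__ float partial[NUM_WARPS];

  val = warp_reduce_sum_f32<WARP_SIZE>(val);
  if (lane == 0) partial[warp] = val;
  __syncthreads();

  val = (lane < NUM_WARPS) ? partial[lane] : 0.0f;
  val = warp_reduce_sum_f32<NUM_WARPS>(val);
  return val;
}
```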
```diff
@@ -463,24 +509,6 @@ layer_norm_f32x4_kernel<(K)/4><<<grid, block>>>( \
     break; \
   }
 
-void layer_norm_f32(torch::Tensor x, torch::Tensor y, float g, float b) {
-  CHECK_TORCH_TENSOR_DTYPE(x, torch::kFloat32)
-  CHECK_TORCH_TENSOR_DTYPE(y, torch::kFloat32)
-  CHECK_TORCH_TENSOR_SHAPE(x, y)
-  const int N = x.size(0);
-  const int K = x.size(1);
-  DISPATCH_LAYER_NORM_F32_KERNEL(N, K)
-}
-
-void layer_norm_f32x4(torch::Tensor x, torch::Tensor y, float g, float b) {
-  CHECK_TORCH_TENSOR_DTYPE(x, torch::kFloat32)
-  CHECK_TORCH_TENSOR_DTYPE(y, torch::kFloat32)
-  CHECK_TORCH_TENSOR_SHAPE(x, y)
-  const int N = x.size(0);
-  const int K = x.size(1);
-  DISPATCH_LAYER_NORM_F32x4_KERNEL(N, K)
-}
-
 // fp16
 #define LANUCH_LAYER_NORM_F16F16_KERNEL(K) \
 layer_norm_f16_f16_kernel<(K)><<<grid, block>>>( \
@@ -663,6 +691,65 @@ layer_norm_f16x8_pack_f16_kernel<(K)/8><<<grid, block>>>( \
     break; \
   }
 
+#define LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(K) \
+layer_norm_f16x8_pack_f32_kernel<(K)/8><<<grid, block>>>( \
+  reinterpret_cast<half*>(x.data_ptr()), \
+  reinterpret_cast<half*>(y.data_ptr()), \
+  g, b, N, (K));
+
+#define DISPATCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(N, K) \
+  dim3 block((K)/8); \
+  dim3 grid((N)); \
+  switch ((K)) \
+  { \
+    case 64: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(64) \
+      break; \
+    case 128: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(128) \
+      break; \
+    case 256: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(256) \
+      break; \
+    case 512: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(512) \
+      break; \
+    case 1024: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(1024) \
+      break; \
+    case 2048: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(2048) \
+      break; \
+    case 4096: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(4096) \
+      break; \
+    case 8192: \
+      LANUCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(8192) \
+      break; \
+    default: \
+      throw std::runtime_error( \
+        "only support K: 64/128/.../1024*8"); \
+      break; \
+  }
+
+void layer_norm_f32(torch::Tensor x, torch::Tensor y, float g, float b) {
+  CHECK_TORCH_TENSOR_DTYPE(x, torch::kFloat32)
+  CHECK_TORCH_TENSOR_DTYPE(y, torch::kFloat32)
+  CHECK_TORCH_TENSOR_SHAPE(x, y)
+  const int N = x.size(0);
+  const int K = x.size(1);
+  DISPATCH_LAYER_NORM_F32_KERNEL(N, K)
+}
+
+void layer_norm_f32x4(torch::Tensor x, torch::Tensor y, float g, float b) {
+  CHECK_TORCH_TENSOR_DTYPE(x, torch::kFloat32)
+  CHECK_TORCH_TENSOR_DTYPE(y, torch::kFloat32)
+  CHECK_TORCH_TENSOR_SHAPE(x, y)
+  const int N = x.size(0);
+  const int K = x.size(1);
+  DISPATCH_LAYER_NORM_F32x4_KERNEL(N, K)
+}
+
 void layer_norm_f16_f16(torch::Tensor x, torch::Tensor y, float g, float b) {
   CHECK_TORCH_TENSOR_DTYPE(x, torch::kHalf)
   CHECK_TORCH_TENSOR_DTYPE(y, torch::kHalf)
```
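The LANUCH/DISPATCH macro pair (the LANUCH spelling follows the existing fp16 macros in this file) selects a template instantiation per supported K: one block per token and one thread per 8 packed half elements, which is why only the listed multiples of 8 are accepted and the default branch throws. As a rough illustration of what a single branch boils down to, here is a hypothetical standalone wrapper for K = 512 (the function name is invented for the example and it is assumed to sit in layer_norm.cu below the kernel definition):

```cuda
#include <cuda_fp16.h>  // already included at the top of layer_norm.cu; repeated so the sketch stands alone

// Roughly what DISPATCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(N, 512) expands to,
// for a contiguous fp16 tensor of shape [N, 512].
void launch_layer_norm_f16x8_pack_f32_k512(half* x, half* y, float g, float b, int N) {
  constexpr int K = 512;
  dim3 block(K / 8);  // 64 threads; the NUM_THREADS template argument is also K/8
  dim3 grid(N);       // one block per row/token
  layer_norm_f16x8_pack_f32_kernel<K / 8><<<grid, block>>>(x, y, g, b, N, K);
}
```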
```diff
@@ -699,6 +786,14 @@ void layer_norm_f16x8_pack_f16(torch::Tensor x, torch::Tensor y, float g, float
   DISPATCH_LAYER_NORM_F16x8_PACK_F16_KERNEL(N, K)
 }
 
+void layer_norm_f16x8_pack_f32(torch::Tensor x, torch::Tensor y, float g, float b) {
+  CHECK_TORCH_TENSOR_DTYPE(x, torch::kHalf)
+  CHECK_TORCH_TENSOR_DTYPE(y, torch::kHalf)
+  CHECK_TORCH_TENSOR_SHAPE(x, y)
+  const int N = x.size(0);
+  const int K = x.size(1);
+  DISPATCH_LAYER_NORM_F16x8_PACK_F32_KERNEL(N, K)
+}
 
 void layer_norm_f16_f32(torch::Tensor x, torch::Tensor y, float g, float b) {
   CHECK_TORCH_TENSOR_DTYPE(x, torch::kHalf)
@@ -713,9 +808,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f32)
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f32x4)
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16_f16)
+  TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16_f32)
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16x2_f16)
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16x8_f16)
   TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16x8_pack_f16)
-  TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16_f32)
+  TORCH_BINDING_COMMON_EXTENSION(layer_norm_f16x8_pack_f32)
 }
 
```
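TORCH_BINDING_COMMON_EXTENSION itself is defined earlier in the bindings section and is outside this diff; it is assumed to be a thin wrapper over pybind11's m.def, roughly as sketched below (STRINGFY is visible as context in the first hunk of this file):

```cuda
// Assumed shape of the binding macro used inside PYBIND11_MODULE above;
// the real definition lives earlier in layer_norm.cu and is not shown in this diff.
#define TORCH_BINDING_COMMON_EXTENSION(func) \
  m.def(STRINGFY(func), &func, STRINGFY(func));
```

With that registration, layer_norm_f16x8_pack_f32 becomes callable from Python as lib.layer_norm_f16x8_pack_f32(x, y, g, b), which is exactly what the benchmark additions in layer_norm.py below exercise.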

layer-norm/layer_norm.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -82,6 +82,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 run_benchmark(lib.layer_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
 
@@ -103,6 +104,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 run_benchmark(lib.layer_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
 
@@ -121,6 +123,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 run_benchmark(lib.layer_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
 
@@ -138,6 +141,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 out_f16 = out.half()
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
 
@@ -149,6 +153,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 out_f16 = torch.zeros_like(x_f16).cuda().half().contiguous()
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
 
@@ -160,5 +165,6 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
 out_f16 = torch.zeros_like(x_f16).cuda().half().contiguous()
 run_benchmark(lib.layer_norm_f16x8_f16, x_f16, "f16x8f16", out_f16)
 run_benchmark(lib.layer_norm_f16x8_pack_f16, x_f16, "f16x8packf16", out_f16)
+run_benchmark(lib.layer_norm_f16x8_pack_f32, x_f16, "f16x8packf32", out_f16)
 run_benchmark(naive_layer_norm, x_f16, "f16_th")
 print("-" * 85)
```
