
Commit 0d3e128

fix comments (#345)
1 parent 955557b commit 0d3e128

18 files changed (+64 lines, -97 lines)

kernels/elementwise/elementwise.cu

Lines changed: 4 additions & 4 deletions
@@ -17,8 +17,8 @@
 #define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162 *>(&(value))[0])
 #define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])

-// -------------------------------------- FP32
-// -------------------------------------- ElementWise Add grid(N/256),
+// FP32
+// ElementWise Add grid(N/256),
 // block(256) a: Nx1, b: Nx1, c: Nx1, c = elementwise_add(a, b)
 __global__ void elementwise_add_f32_kernel(float *a, float *b, float *c,
                                            int N) {
@@ -45,8 +45,8 @@ __global__ void elementwise_add_f32x4_kernel(float *a, float *b, float *c,
   }
 }

-// -------------------------------------- FP16
-// -------------------------------------- ElementWise Add grid(N/256),
+// FP16
+// ElementWise Add grid(N/256),
 // block(256) a: Nx1, b: Nx1, c: Nx1, c = elementwise_add(a, b)
 __global__ void elementwise_add_f16_kernel(half *a, half *b, half *c, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
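The restored comment pins down the launch shape and semantics: grid(N/256), block(256), c = elementwise_add(a, b) over Nx1 tensors. A minimal sketch of how that maps to code; the kernel body and the host-side launcher are assumptions, since only the signature and the index line appear in the hunks:

// FP32 elementwise add matching the grid(N/256), block(256) comment.
__global__ void elementwise_add_f32_sketch(float *a, float *b, float *c,
                                           int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x; // one element per thread
  if (idx < N)
    c[idx] = a[idx] + b[idx];
}

// Hypothetical host-side launch; rounding the grid up covers N % 256 != 0.
void launch_elementwise_add_f32(float *a, float *b, float *c, int N) {
  dim3 block(256);
  dim3 grid((N + 256 - 1) / 256);
  elementwise_add_f32_sketch<<<grid, block>>>(a, b, c, N);
}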

kernels/elu/elu.cu

Lines changed: 4 additions & 4 deletions
@@ -31,13 +31,13 @@
 m.def(STRINGFY(func), &func, STRINGFY(func));

 // ELU compute functions
-// -------------------------------------- FP32
+// FP32
 // --------------------------------------
 __device__ __forceinline__ float elu(float x) {
   return x > 0.f ? x : ALPHA * (expf(x) - 1.f);
 }

-// -------------------------------------- FP16
+// FP16
 // --------------------------------------
 __device__ __forceinline__ half elu_half(half x) {
   return __hgt(x, __float2half(0.f))
@@ -46,7 +46,7 @@ __device__ __forceinline__ half elu_half(half x) {
 }

 // CUDA kernel functions
-// -------------------------------------- FP32
+// FP32
 // --------------------------------------
 __global__ void elu_f32_kernel(float *x, float *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -67,7 +67,7 @@ __global__ void elu_f32x4_kernel(float *x, float *y, int N) {
   }
 }

-// -------------------------------------- FP16
+// FP16
 // --------------------------------------
 __global__ void elu_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
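The FP32 device function above carries the full definition: ELU(x) = x for x > 0, otherwise ALPHA * (exp(x) - 1). A self-contained sketch of how the scalar kernel wires it up; the ALPHA value and the kernel body are assumptions, only the signatures are visible in the hunks:

#define ALPHA 1.0f // assumed value; the real macro is defined elsewhere in elu.cu

__device__ __forceinline__ float elu_sketch(float x) {
  // Identity for positive inputs, alpha * (e^x - 1) for the rest.
  return x > 0.f ? x : ALPHA * (expf(x) - 1.f);
}

__global__ void elu_f32_sketch(float *x, float *y, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
    y[idx] = elu_sketch(x[idx]);
}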

kernels/gelu/gelu.cu

Lines changed: 4 additions & 4 deletions
@@ -57,8 +57,8 @@ __inline__ __device__ float gelu_none_approximate(float x) {
   return x * 0.5 * (1 + erff(x * M_SQRT1_2));
 }

-// -------------------------------------- FP32
-// -------------------------------------- GELU tanh approximate: x, y:x 0.5 * x
+// FP32
+// GELU tanh approximate: x, y:x 0.5 * x
 // * (1.0 + tanh(0.7978845608 * x * (1.0 + 0.044715 * x * x))) grid(N/256),
 // block(K=256)
 __global__ void gelu_f32_kernel(float *x, float *y, int N) {
@@ -91,8 +91,8 @@ __global__ void gelu_f32x4_kernel(float *x, float *y, int N) {
   }
 }

-// -------------------------------------- FP16
-// -------------------------------------- GELU approximate: x, y:x 0.5 * x *
+// FP16
+// GELU approximate: x, y:x 0.5 * x *
 // (1.0 + tanh(0.7978845608 (x + 0.044715 * x * x * x))) Vec4
 __global__ void gelu_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
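The restored comment spells out the tanh approximation: y = 0.5 * x * (1 + tanh(0.7978845608 * x * (1 + 0.044715 * x^2))), where 0.7978845608 ≈ sqrt(2/pi). A small sketch of that formula as an FP32 device function; the helper name is illustrative, not from the file:

// Tanh-approximate GELU, exactly as the comment above states it.
__device__ __forceinline__ float gelu_tanh_approx_sketch(float x) {
  // 0.7978845608 * x * (1 + 0.044715 * x^2) == sqrt(2/pi) * (x + 0.044715 * x^3)
  float inner = 0.7978845608f * x * (1.0f + 0.044715f * x * x);
  return 0.5f * x * (1.0f + tanhf(inner));
}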

kernels/hardshrink/hardshrink.cu

Lines changed: 4 additions & 8 deletions
@@ -31,8 +31,7 @@
 m.def(STRINGFY(func), &func, STRINGFY(func));

 // HARDSHRINK compute functions
-// -------------------------------------- FP32
-// --------------------------------------
+// FP32
 __device__ __forceinline__ float hardshrink(float x) {
   if (x > LAMBD || x < -LAMBD) {
     return x;
@@ -41,8 +40,7 @@ __device__ __forceinline__ float hardshrink(float x) {
   }
 }

-// -------------------------------------- FP16
-// --------------------------------------
+// FP16
 __device__ __forceinline__ half hardshrink_half(half x) {
   if (x > __float2half(LAMBD) || x < __float2half(-LAMBD)) {
     return x;
@@ -52,8 +50,7 @@ __device__ __forceinline__ half hardshrink_half(half x) {
 }

 // CUDA kernel functions
-// -------------------------------------- FP32
-// --------------------------------------
+// FP32
 __global__ void hardshrink_f32_kernel(float *x, float *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
@@ -73,8 +70,7 @@ __global__ void hardshrink_f32x4_kernel(float *x, float *y, int N) {
   }
 }

-// -------------------------------------- FP16
-// --------------------------------------
+// FP16
 __global__ void hardshrink_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
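Besides the scalar kernels, the hunks reference hardshrink_f32x4_kernel. A hedged sketch of the float4-vectorized pattern such *_f32x4 kernels typically follow, reusing the hardshrink device function shown above; the body is an assumption (only the name appears in the diff) and the N % 4 != 0 tail is ignored:

#define FLOAT4(value) (reinterpret_cast<float4 *>(&(value))[0])

__global__ void hardshrink_f32x4_sketch(float *x, float *y, int N) {
  int idx = 4 * (blockIdx.x * blockDim.x + threadIdx.x); // 4 floats per thread
  if (idx < N) {
    float4 reg_x = FLOAT4(x[idx]); // one 128-bit load
    float4 reg_y;
    reg_y.x = hardshrink(reg_x.x);
    reg_y.y = hardshrink(reg_x.y);
    reg_y.z = hardshrink(reg_x.z);
    reg_y.w = hardshrink(reg_x.w);
    FLOAT4(y[idx]) = reg_y; // one 128-bit store
  }
}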

kernels/hardswish/hardswish.cu

Lines changed: 4 additions & 8 deletions
@@ -32,8 +32,7 @@
 m.def(STRINGFY(func), &func, STRINGFY(func));

 // HARDSWISH compute functions
-// -------------------------------------- FP32
-// --------------------------------------
+// FP32
 __device__ __forceinline__ float hardswish(float x) {
   if (x >= THRESHOLD_A) {
     return x;
@@ -44,8 +43,7 @@ __device__ __forceinline__ float hardswish(float x) {
   }
 }

-// -------------------------------------- FP16
-// --------------------------------------
+// FP16
 __device__ __forceinline__ half hardswish_half(half x) {
   if (x > __float2half(THRESHOLD_A)) {
     return x;
@@ -57,8 +55,7 @@ __device__ __forceinline__ half hardswish_half(half x) {
 }

 // CUDA kernel functions
-// -------------------------------------- FP32
-// --------------------------------------
+// FP32
 __global__ void hardswish_f32_kernel(float *x, float *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
@@ -78,8 +75,7 @@ __global__ void hardswish_f32x4_kernel(float *x, float *y, int N) {
   }
 }

-// -------------------------------------- FP16
-// --------------------------------------
+// FP16
 __global__ void hardswish_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
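The hunks only show the upper branch of hardswish (return x when x >= THRESHOLD_A). For reference, the full piecewise form, assuming the usual bounds of +3 and -3 for the thresholds; the actual macro values are not visible in this diff:

// Piecewise hardswish sketch with assumed thresholds of +3 and -3.
__device__ __forceinline__ float hardswish_sketch(float x) {
  if (x >= 3.0f)
    return x;                   // upper region: identity
  if (x <= -3.0f)
    return 0.0f;                // lower region: zero
  return x * (x + 3.0f) / 6.0f; // middle region: x * (x + 3) / 6
}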

kernels/histogram/histogram.cu

Lines changed: 1 addition & 2 deletions
@@ -35,8 +35,7 @@ __global__ void histogram_i32x4_kernel(int *a, int *y, int N) {
   }
 }

-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
+// PyTorch bindings for custom kernel
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));
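The hunk only touches the binding comment, but histogram_i32x4_kernel names the underlying technique: per-thread binning with atomic increments. A hedged scalar sketch of that idea; the body is an assumption, and inputs are assumed to already be valid bin indices:

__global__ void histogram_i32_sketch(int *a, int *y, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
    atomicAdd(&y[a[idx]], 1); // contended but correct per-bin count
}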

kernels/layer-norm/layer_norm.cu

Lines changed: 4 additions & 4 deletions
@@ -17,8 +17,8 @@
 #define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162 *>(&(value))[0])
 #define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])

-// -------------------------------------- FP32
-// -------------------------------------- Warp Reduce Sum
+// FP32
+// Warp Reduce Sum
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ float warp_reduce_sum_f32(float val) {
 #pragma unroll
@@ -119,8 +119,8 @@ __global__ void layer_norm_f32x4_kernel(float *x, float *y, float g, float b,
   FLOAT4(y[idx]) = reg_y;
 }

-// -------------------------------------- FP16
-// -------------------------------------- Warp Reduce Sum: Half
+// FP16
+// Warp Reduce Sum: Half
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ half warp_reduce_sum_f16_f16(half val) {
 #pragma unroll
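The "Warp Reduce Sum" template above (its signature and #pragma unroll are in the hunk) is the standard butterfly reduction over shuffle intrinsics. A sketch of the loop it most likely wraps, assuming WARP_SIZE is 32; the loop body is an assumption, since the hunk cuts off at #pragma unroll:

template <const int kWarpSize = 32>
__device__ __forceinline__ float warp_reduce_sum_f32_sketch(float val) {
#pragma unroll
  for (int mask = kWarpSize >> 1; mask >= 1; mask >>= 1) {
    // Butterfly exchange: each lane adds its partner's partial sum.
    val += __shfl_xor_sync(0xffffffff, val, mask);
  }
  return val; // every lane ends up holding the warp-wide sum
}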

kernels/mat-transpose/mat_transpose.cu

Lines changed: 10 additions & 21 deletions
@@ -23,8 +23,8 @@
 #define MAX_EXP_F16 __float2half(11.089866488461016f)
 #define MIN_EXP_F16 __float2half(-9.704060527839234f)

-// -------------------------------------- FP32
-// -------------------------------------- col2row means read x[row][col] and
+// FP32
+// col2row means read x[row][col] and
 // write y[col][row] row2col means read x[col][row] and write y[row][col]
 __global__ void mat_transpose_f32_col2row_kernel(float *x, float *y,
                                                  const int row, const int col) {
@@ -216,7 +216,6 @@ __global__ void mat_transpose_f32x4_shared_row2col2d_kernel(float *x, float *y,
   }
 }

-
 __global__ void mat_transpose_f32x4_shared_bcf_col2row2d_kernel(float *x,
                                                                  float *y,
                                                                  const int row,
@@ -298,11 +297,8 @@ __global__ void mat_transpose_f32x4_shared_bcf_row2col2d_kernel(float *x,
   }
 }

-
-__global__ void mat_transpose_f32x4_shared_bcf_merge_write_row2col2d_kernel(float *x,
-                                                                             float *y,
-                                                                             const int row,
-                                                                             const int col) {
+__global__ void mat_transpose_f32x4_shared_bcf_merge_write_row2col2d_kernel(
+    float *x, float *y, const int row, const int col) {
   const int global_x = blockIdx.x * blockDim.x + threadIdx.x;
   const int global_y = blockIdx.y * blockDim.y + threadIdx.y;
   const int local_x = threadIdx.x;
@@ -328,18 +324,13 @@ __global__ void mat_transpose_f32x4_shared_bcf_merge_write_row2col2d_kernel(floa
     smem_val.w = tile[local_x * 4 + 3][local_y];

     const int gid_x = blockIdx.x * blockDim.x;
-    const int gid_y = blockIdx.y * blockDim.y * 4;
+    const int gid_y = blockIdx.y * blockDim.y * 4;
     const int out_y = gid_y + local_x * 4;
     const int out_x = gid_x + local_y;
     reinterpret_cast<float4 *>(y)[(out_x * row + out_y) / 4] = FLOAT4(smem_val);
   }
 }

-// TODO: may support double buffer pipeline mat transpose ?
-// TODO: may support fp16 mat transpose ?
-
-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));
@@ -373,7 +364,7 @@ __global__ void mat_transpose_f32x4_shared_bcf_merge_write_row2col2d_kernel(floa
     dim3 block(WARP_SIZE_S, WARP_SIZE_S); \
     dim3 grid((N + WARP_SIZE_S - 1) / (WARP_SIZE_S * n_element_col), \
               (M + WARP_SIZE_S - 1) / (WARP_SIZE_S * n_element_row)); \
-    mat_transpose_##tag##2d_kernel <<< grid, \
+    mat_transpose_##tag##2d_kernel < < < grid, \
         block >>> (reinterpret_cast<element_type *>(x.data_ptr()), \
                    reinterpret_cast<element_type *>(y.data_ptr()), M, N); \
   }
@@ -400,11 +391,8 @@ TORCH_BINDING_MAT_TRANSPOSE2D(f32x4_shared_bcf_col2row, torch::kFloat32, float,
                               1, 4)
 TORCH_BINDING_MAT_TRANSPOSE2D(f32x4_shared_bcf_row2col, torch::kFloat32, float,
                               4, 1)
-TORCH_BINDING_MAT_TRANSPOSE2D(f32x4_shared_bcf_merge_write_row2col, torch::kFloat32, float,
-                              4, 1)
-
-// TODO: may support double buffer pipeline mat transpose ?
-// TODO: may support fp16 mat transpose ?
+TORCH_BINDING_MAT_TRANSPOSE2D(f32x4_shared_bcf_merge_write_row2col,
+                              torch::kFloat32, float, 4, 1)

 // CuTe implentations
 extern void mat_transpose_cute_col2row_reg(torch::Tensor, torch::Tensor);
@@ -442,7 +430,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // shared memory optimize with bcf
   TORCH_BINDING_COMMON_EXTENSION(mat_transpose_f32x4_shared_bcf_col2row2d)
   TORCH_BINDING_COMMON_EXTENSION(mat_transpose_f32x4_shared_bcf_row2col2d)
-  TORCH_BINDING_COMMON_EXTENSION(mat_transpose_f32x4_shared_bcf_merge_write_row2col2d)
+  TORCH_BINDING_COMMON_EXTENSION(
+      mat_transpose_f32x4_shared_bcf_merge_write_row2col2d)
   // CuTe implentations
   TORCH_BINDING_COMMON_EXTENSION(mat_transpose_cute_col2row_reg)
   TORCH_BINDING_COMMON_EXTENSION(mat_transpose_cute_row2col_reg)
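The comment restored at the top of this file defines the naming: col2row reads x[row][col] and writes y[col][row]; row2col reads x[col][row] and writes y[row][col]. A minimal 2D-indexed sketch of the col2row case; the indexing scheme is an assumption, since the kernel body itself is not part of this diff:

// Naive col2row transpose: y[col][row] = x[row][col], one element per thread.
__global__ void mat_transpose_f32_col2row_sketch(float *x, float *y,
                                                 const int row, const int col) {
  const int global_x = blockIdx.x * blockDim.x + threadIdx.x; // column in x
  const int global_y = blockIdx.y * blockDim.y + threadIdx.y; // row in x
  if (global_x < col && global_y < row) {
    y[global_x * row + global_y] = x[global_y * col + global_x];
  }
}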

kernels/nms/nms.cu

Lines changed: 0 additions & 2 deletions
@@ -58,8 +58,6 @@ __global__ void nms_kernel(const float *boxes, const float *scores, int *keep,
     return;
 }

-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));

kernels/reduce/block_all_reduce.cu

Lines changed: 10 additions & 10 deletions
@@ -25,8 +25,8 @@
 // MatMul FP8 -> Tensor Cores

 // CUDA review (0x00): learn block all reduce step by step: from FP32 to FP16/BF16, then to FP8
-// -------------------------------------- FP32
-// -------------------------------------- Warp Reduce Sum
+// FP32
+// Warp Reduce Sum
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ float warp_reduce_sum_f32(float val) {
 #pragma unroll
@@ -93,8 +93,8 @@ __global__ void block_all_reduce_sum_f32x4_f32_kernel(float *a, float *y,
   atomicAdd(y, sum);
 }

-// -------------------------------------- FP16
-// -------------------------------------- Warp Reduce Sum: Half
+// FP16
+// Warp Reduce Sum: Half
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ half warp_reduce_sum_f16_f16(half val) {
 #pragma unroll
@@ -301,8 +301,8 @@ __global__ void block_all_reduce_sum_f16x8_pack_f32_kernel(half *a, float *y,
   atomicAdd(y, sum);
 }

-// -------------------------------------- BF16
-// -------------------------------------- Warp Reduce Sum: Half
+// BF16
+// Warp Reduce Sum: Half
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ __nv_bfloat16
 warp_reduce_sum_bf16_bf16(__nv_bfloat16 val) {
@@ -520,8 +520,8 @@ __global__ void block_all_reduce_sum_bf16x8_pack_f32_kernel(__nv_bfloat16 *a,
   atomicAdd(y, sum);
 }

-// -------------------------------------- FP8
-// --------------------------------------
+// FP8
+//
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ half
 warp_reduce_sum_fp8_e4m3_f16(__nv_fp8_storage_t val) {
@@ -680,8 +680,8 @@ block_all_reduce_sum_fp8_e5m2x16_pack_f16_kernel(__nv_fp8_storage_t *a,
   atomicAdd(y, __half2float(sum));
 }

-// -------------------------------------- INT8
-// --------------------------------------
+// INT8
+//
 template <const int kWarpSize = WARP_SIZE>
 __device__ __forceinline__ int32_t warp_reduce_sum_i8_i32(int8_t val) {
   int32_t val_i32 = static_cast<int32_t>(val);
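Every kernel excerpted above ends in atomicAdd(y, sum), the tell for the shared block-all-reduce structure: warp-level shuffle reduction, one partial per warp staged in shared memory, a final fold by warp 0, then a single atomic into the global accumulator. A hedged FP32 sketch of that structure, assuming a 256-thread block and the warp_reduce_sum_f32 template from the first hunk:

template <const int NUM_THREADS = 256>
__global__ void block_all_reduce_sum_f32_sketch(float *a, float *y, int N) {
  constexpr int NUM_WARPS = (NUM_THREADS + 32 - 1) / 32;
  __shared__ float reduce_smem[NUM_WARPS];

  int tid = threadIdx.x;
  int idx = blockIdx.x * NUM_THREADS + tid;
  float sum = (idx < N) ? a[idx] : 0.0f;

  int warp = tid / 32, lane = tid % 32;
  sum = warp_reduce_sum_f32<32>(sum); // reduce within each warp
  if (lane == 0)
    reduce_smem[warp] = sum;          // stage one partial per warp
  __syncthreads();

  if (warp == 0) {                    // warp 0 folds the per-warp partials
    sum = (lane < NUM_WARPS) ? reduce_smem[lane] : 0.0f;
    sum = warp_reduce_sum_f32<NUM_WARPS>(sum);
    if (tid == 0)
      atomicAdd(y, sum);              // accumulate across blocks
  }
}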
