Commit edc46f3

fix comments (#346)
1 parent 0d3e128 commit edc46f3

11 files changed: +7 additions, -32 deletions

kernels/elementwise/elementwise.cu

Lines changed: 0 additions & 2 deletions
@@ -120,8 +120,6 @@ __global__ void elementwise_add_f16x8_pack_kernel(half *a, half *b, half *c,
   }
 }
 
-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));
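
For context, the STRINGFY / TORCH_BINDING_COMMON_EXTENSION pair that survives this hunk is what registers the kernel wrappers with PyTorch: each macro call expands to m.def("name", &name, "name"). A minimal sketch of how it is typically invoked (the module body and wrapper names below are illustrative, not taken from this commit):

#include <torch/extension.h>

// Hypothetical module body: each TORCH_BINDING_COMMON_EXTENSION call exposes
// the C++ wrapper to Python under its own function name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  TORCH_BINDING_COMMON_EXTENSION(elementwise_add_f32)
  TORCH_BINDING_COMMON_EXTENSION(elementwise_add_f16)
}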

kernels/elu/elu.cu

Lines changed: 0 additions & 5 deletions
@@ -32,13 +32,11 @@
 
 // ELU computation function
 // FP32
-// --------------------------------------
 __device__ __forceinline__ float elu(float x) {
   return x > 0.f ? x : ALPHA * (expf(x) - 1.f);
 }
 
 // FP16
-// --------------------------------------
 __device__ __forceinline__ half elu_half(half x) {
   return __hgt(x, __float2half(0.f))
              ? x
@@ -47,7 +45,6 @@ __device__ __forceinline__ half elu_half(half x) {
 
 // CUDA kernel functions
 // FP32
-// --------------------------------------
 __global__ void elu_f32_kernel(float *x, float *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
@@ -68,7 +65,6 @@ __global__ void elu_f32x4_kernel(float *x, float *y, int N) {
 }
 
 // FP16
-// --------------------------------------
 __global__ void elu_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
@@ -129,7 +125,6 @@ __global__ void elu_f16x8_pack_kernel(half *x, half *y, int N) {
   }
 }
 
-// PyTorch binding code
 #define TORCH_BINDING_ELU(packed_type, th_type, element_type, n_elements) \
   void elu_##packed_type(torch::Tensor x, torch::Tensor y) { \
     CHECK_TORCH_TENSOR_DTYPE(x, (th_type)) \
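
As a side note, the __device__ elu() shown in the first hunk computes elu(x) = x for x > 0 and ALPHA * (exp(x) - 1) otherwise. A tiny host-side reference, useful when checking kernel output; the ALPHA default here is an assumption, not taken from the diff:

#include <cmath>

// Assumed default; elu.cu defines its own ALPHA macro elsewhere in the file.
#ifndef ALPHA
#define ALPHA 1.0f
#endif

// CPU reference matching the __device__ elu() in the hunk above.
inline float elu_ref(float x) {
  return x > 0.f ? x : ALPHA * (std::exp(x) - 1.f);
}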

kernels/embedding/embedding.cu

Lines changed: 0 additions & 2 deletions
@@ -78,8 +78,6 @@ __global__ void embedding_f16x8_pack_kernel(const int *idx, half *weight,
       LDST128BITS(weight[offset + 8 * tx]);
 }
 
-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));

kernels/gelu/gelu.cu

Lines changed: 0 additions & 2 deletions
@@ -182,8 +182,6 @@ __global__ void gelu_f16x8_pack_kernel(half *x, half *y, int N) {
   }
 }
 
-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));

kernels/hardshrink/hardshrink.cu

Lines changed: 0 additions & 1 deletion
@@ -131,7 +131,6 @@ __global__ void hardshrink_f16x8_pack_kernel(half *x, half *y, int N) {
   }
 }
 
-// PyTorch binding code
 #define TORCH_BINDING_HARDSHRINK(packed_type, th_type, element_type, \
                                  n_elements) \
   void hardshrink_##packed_type(torch::Tensor x, torch::Tensor y) { \

kernels/hardswish/hardswish.cu

Lines changed: 0 additions & 1 deletion
@@ -136,7 +136,6 @@ __global__ void hardswish_f16x8_pack_kernel(half *x, half *y, int N) {
   }
 }
 
-// PyTorch binding code
 #define TORCH_BINDING_HARDSWISH(packed_type, th_type, element_type, \
                                 n_elements) \
   void hardswish_##packed_type(torch::Tensor x, torch::Tensor y) { \

kernels/histogram/histogram.cu

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ __global__ void histogram_i32x4_kernel(int *a, int *y, int N) {
   }
 }
 
-// PyTorch bindings for custom kernel
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));

kernels/layer-norm/layer_norm.cu

Lines changed: 0 additions & 2 deletions
@@ -456,8 +456,6 @@ __global__ void layer_norm_f16x8_pack_f32_kernel(half *x, half *y, float g,
   // TODO: support non 8-multiple K here
 }
 
-// --------------------- PyTorch bindings for custom kernel
-// -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
   m.def(STRINGFY(func), &func, STRINGFY(func));

kernels/nvidia-nsight/elementwise.cu

Lines changed: 4 additions & 4 deletions
@@ -16,8 +16,8 @@
 #define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162 *>(&(value))[0])
 #define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])
 
-// -------------------------------------- FP32
-// -------------------------------------- ElementWise Add grid(N/256),
+// FP32
+// ElementWise Add grid(N/256),
 // block(256) a: Nx1, b: Nx1, c: Nx1, c = elementwise_add(a, b)
 __global__ void elementwise_add_f32_kernel(float *a, float *b, float *c,
                                            int N) {
@@ -44,8 +44,8 @@ __global__ void elementwise_add_f32x4_kernel(float *a, float *b, float *c,
   }
 }
 
-// -------------------------------------- FP16
-// -------------------------------------- ElementWise Add grid(N/256),
+// FP16
+// ElementWise Add grid(N/256),
 // block(256) a: Nx1, b: Nx1, c: Nx1, c = elementwise_add(a, b)
 __global__ void elementwise_add_f16_kernel(half *a, half *b, half *c, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
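
The LDST128BITS macro kept in the first hunk is what the f32x4 variants use for vectorized access. A minimal sketch of that pattern (the kernel body below is illustrative, not this file's exact code): each thread handles four consecutive floats through one 128-bit load and one 128-bit store.

// Sketch only: one thread per 4 elements; assumes a, b, c come from cudaMalloc
// (16-byte aligned) and that the tail where N is not a multiple of 4 is
// handled separately.
__global__ void elementwise_add_f32x4_sketch(float *a, float *b, float *c,
                                             int N) {
  int idx = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  if (idx + 3 < N) {
    float4 ra = LDST128BITS(a[idx]); // one 128-bit load covers a[idx..idx+3]
    float4 rb = LDST128BITS(b[idx]);
    float4 rc;
    rc.x = ra.x + rb.x;
    rc.y = ra.y + rb.y;
    rc.z = ra.z + rb.z;
    rc.w = ra.w + rb.w;
    LDST128BITS(c[idx]) = rc;        // one 128-bit store to c[idx..idx+3]
  }
}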

kernels/nvidia-nsight/relu.cu

Lines changed: 3 additions & 4 deletions
@@ -14,8 +14,8 @@
 #define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162 *>(&(value))[0])
 #define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])
 
-// -------------------------------------- FP32
-// -------------------------------------- Relu x: N, y: N y=max(0,x)
+// FP32
+// Relu x: N, y: N y=max(0,x)
 // grid(N/256), block(K=256)
 __global__ void relu_f32_kernel(float *x, float *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -38,8 +38,7 @@ __global__ void relu_f32x4_kernel(float *x, float *y, int N) {
   }
 }
 
-// -------------------------------------- FP16
-// --------------------------------------
+// FP16
 __global__ void relu_f16_kernel(half *x, half *y, int N) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N)
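
The "grid(N/256), block(K=256)" comment retained above describes the usual launch shape for these one-thread-per-element kernels. A hedged host-side sketch (the helper name and the d_x/d_y device pointers are assumptions, not part of this file):

#include <cuda_runtime.h>

// Sketch: launch relu_f32_kernel over N elements, rounding the grid size up
// so every element is covered; d_x and d_y are device buffers of N floats.
void launch_relu_f32(float *d_x, float *d_y, int N) {
  dim3 block(256);
  dim3 grid((N + block.x - 1) / block.x);
  relu_f32_kernel<<<grid, block>>>(d_x, d_y, N);
}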
