Skip to content

Commit 1b62b49

Browse files
committed
Feat: Added circular-padding (wrap-around) support to the CUDA conv2d, depthwise-conv2d, and transposed-conv2d kernels
1 parent d7f5958 commit 1b62b49

File tree

4 files changed

+204
-75
lines changed

4 files changed

+204
-75
lines changed

ggml/src/ggml-cuda/conv2d-dw.cu

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ struct conv_params {
88
int padding_x, padding_y;
99
int dilation_x, dilation_y;
1010
int channels, batches;
11+
int circular;
1112
};
1213

1314
struct kernel_bounds {
@@ -17,21 +18,34 @@ struct kernel_bounds {
1718

1819
// Compute the valid kernel-tap range [x_min, x_max) x [y_min, y_max) for one
// output pixel. With circular (wrap-around) padding every tap is valid, so the
// bounds span the full kernel window; with zero padding the window is clipped
// so the raw input coordinates stay inside [0, in_w) x [0, in_h).
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
    kernel_bounds bounds;

    if (params.circular) {
        bounds.x_min = 0;
        bounds.x_max = params.kernel_w;
        bounds.y_min = 0;
        bounds.y_max = params.kernel_h;
        return bounds;
    }

    // base_* is the (possibly negative) offset of the first input coordinate
    // touched by this output pixel; the ceil-divisions clip the tap range.
    const int base_y = params.padding_y - out_y * params.stride_y;
    bounds.y_min     = max(0, (base_y + params.dilation_y - 1) / params.dilation_y);
    bounds.y_max     = min(params.kernel_h, (params.in_h + base_y + params.dilation_y - 1) / params.dilation_y);

    const int base_x = params.padding_x - out_x * params.stride_x;
    bounds.x_min     = max(0, (base_x + params.dilation_x - 1) / params.dilation_x);
    bounds.x_max     = min(params.kernel_w, (params.in_w + base_x + params.dilation_x - 1) / params.dilation_x);

    return bounds;
}
3040

3141
// Input coordinate read by kernel tap `kern_coord` when producing output
// position `out_coord`. May be negative or >= the input size; callers are
// responsible for clipping (zero padding) or wrapping (circular padding).
__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
    const int strided = out_coord * stride;
    const int dilated = kern_coord * dilation;
    return strided + dilated - padding;
}
3444

45+
// Map `coord` into [0, size) for circular (wrap-around) addressing.
// The explicit branch handles negative coordinates; assumes size > 0.
__device__ __forceinline__ int wrap_coord(int coord, int size) {
    const int m = coord % size;
    return m < 0 ? m + size : m;
}
48+
3549
struct whcn_layout {
3650
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
3751
return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x;
// Depthwise 2D convolution: one thread per output element, launched as a 1D
// grid of 1D blocks. `Layout` (whcn or cwhn) maps logical (n, c, y, x) indices
// to linear offsets. When `circular` is non-zero, out-of-range input
// coordinates wrap around (periodic padding) instead of being skipped.
//
// NOTE(review): the committed circular branch computed wrapped coordinates
// into in_y_idx/in_x_idx but then indexed input with src_y_idx/src_x_idx,
// which are declared only in the non-circular branch. Fixed here, and the two
// near-identical loop nests merged; the per-tap `circular` branch is uniform
// across the grid, so it introduces no warp divergence.
template <typename T, typename Layout>
__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
                                 const int in_w, const int in_h, const int out_w, const int out_h,
                                 const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
                                 const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
                                 const int channels, const int batches,
                                 const int circular) {
    const int global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
    const int total_elements = batches * channels * out_h * out_w;

    if (global_idx >= total_elements) {
        return;
    }

    conv_params params = { in_w,     in_h,      out_w,     out_h,      kernel_w,   kernel_h, stride_x,
                           stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches, circular };

    int batch_idx, channel_idx, out_y_idx, out_x_idx;
    Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);

    T accumulator = 0;
    // Circular padding yields full-window bounds; zero padding yields bounds
    // clipped so the unwrapped coordinates below are always in-range.
    kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);

    for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
        int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
        if (params.circular) {
            in_y_idx = wrap_coord(in_y_idx, params.in_h);
        }

        for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
            int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
            if (params.circular) {
                in_x_idx = wrap_coord(in_x_idx, params.in_w);
            }

            const T input_val  = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
            const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];

            accumulator += input_val * kernel_val;
        }
    }

    output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
}
@@ -132,6 +164,7 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
132164
const int padding_y = p[3];
133165
const int dilation_x = p[4];
134166
const int dilation_y = p[5];
167+
const int circular = p[6];
135168

136169
const int in_w = input->ne[0];
137170
const int in_h = input->ne[1];
@@ -150,11 +183,11 @@ void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
150183
if (ggml_is_contiguous(input)) {
151184
conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
152185
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
153-
dilation_x, dilation_y, channels, batches);
186+
dilation_x, dilation_y, channels, batches, circular);
154187
} else if (ggml_is_contiguous_channels(input)) {
155188
conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
156189
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
157-
dilation_x, dilation_y, channels, batches);
190+
dilation_x, dilation_y, channels, batches, circular);
158191
} else {
159192
GGML_ABORT("Unsupported memory layout for conv_2d_dw");
160193
}

ggml/src/ggml-cuda/conv2d-transpose.cu

Lines changed: 58 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@
33
#include "conv2d-transpose.cuh"
44
#include "ggml.h"
55

6+
7+
// Map `coord` into [0, size) for circular (wrap-around) addressing.
// The explicit branch handles negative coordinates; assumes size > 0.
__device__ __forceinline__ int wrap_coord(int coord, int size) {
    const int m = coord % size;
    return m < 0 ? m + size : m;
}
10+
611
__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
712
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
813
const int out_h, const int kernel_w, const int kernel_h, const int stride,
9-
const int c_in, const int c_out, const int batches) {
14+
const int c_in, const int c_out, const int batches,
15+
const int circular) {
1016
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
1117

1218
const int total_elements = out_w * out_h * c_out * batches;
@@ -22,28 +28,55 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
2228

2329
float accumulator = 0;
2430
// For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
25-
26-
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
27-
for (int kh = 0; kh < kernel_h; ++kh) {
28-
int in_y = out_y_idx - kh;
29-
if (in_y < 0 || in_y % stride) continue;
30-
in_y /= stride;
31-
if (in_y >= in_h) continue;
32-
33-
for (int kw = 0; kw < kernel_w; ++kw) {
34-
int in_x = out_x_idx - kw;
35-
if (in_x < 0 || in_x % stride) continue;
36-
in_x /= stride;
37-
if (in_x >= in_w) continue;
38-
39-
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
40-
const int kernel_idx =
41-
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
42-
43-
float input_val = input[input_idx];
44-
half kern_val = kernel[kernel_idx];
45-
46-
accumulator += input_val * (float) kern_val;
31+
if (circular == 0) {
32+
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
33+
for (int kh = 0; kh < kernel_h; ++kh) {
34+
int in_y = out_y_idx - kh;
35+
if (in_y < 0 || in_y % stride) continue;
36+
in_y /= stride;
37+
if (in_y >= in_h) continue;
38+
39+
for (int kw = 0; kw < kernel_w; ++kw) {
40+
int in_x = out_x_idx - kw;
41+
if (in_x < 0 || in_x % stride) continue;
42+
in_x /= stride;
43+
if (in_x >= in_w) continue;
44+
45+
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
46+
const int kernel_idx =
47+
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
48+
49+
float input_val = input[input_idx];
50+
half kern_val = kernel[kernel_idx];
51+
52+
accumulator += input_val * (float) kern_val;
53+
}
54+
}
55+
}
56+
}
57+
else {
58+
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
59+
for (int kh = 0; kh < kernel_h; ++kh) {
60+
int in_y = out_y_idx - kh;
61+
if (in_y % stride) continue;
62+
in_y /= stride;
63+
in_y = wrap_coord(in_y, in_h);
64+
65+
for (int kw = 0; kw < kernel_w; ++kw) {
66+
int in_x = out_x_idx - kw;
67+
if (in_x % stride) continue;
68+
in_x /= stride;
69+
in_x = wrap_coord(in_x, in_w);
70+
71+
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
72+
const int kernel_idx =
73+
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
74+
75+
float input_val = input[input_idx];
76+
half kern_val = kernel[kernel_idx];
77+
78+
accumulator += input_val * (float) kern_val;
79+
}
4780
}
4881
}
4982
}
@@ -72,6 +105,7 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
72105
const int kernel_h = kernel->ne[1];
73106
const int stride = dst->op_params[0];
74107
const int batches = input->ne[3];
108+
const int circular = dst->op_params[1];
75109

76110
GGML_ASSERT(channels_in == kernel->ne[3]);
77111
GGML_ASSERT(stride > 0);
@@ -87,5 +121,5 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
87121

88122
conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
89123
input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
90-
channels_in, channels_out, batches);
124+
channels_in, channels_out, batches, circular);
91125
}

ggml/src/ggml-cuda/conv2d.cu

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ struct conv_params {
1111
const int64_t IC, OC;
1212
const int64_t B;
1313
const int64_t TOTAL;
14+
const int64_t CIRCULAR;
1415
};
1516

1617
struct kernel_bounds {
@@ -26,12 +27,24 @@ __device__ __forceinline__ int64_t min64(int64_t a, int64_t b) {
2627
return (a < b) ? a : b;
2728
}
2829

30+
// Map `coord` into [0, size) for circular (wrap-around) addressing; the double
// modulo handles negative coordinates. Widened to int64_t: every call site in
// this file passes int64_t values (calculate_input_coord returns int64_t and
// P.IW/P.IH are int64_t), and the previous int parameters silently truncated
// them for very large tensors.
__device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) {
    return (coord % size + size) % size;
}
33+
2934
// Compute the valid kernel-tap range [x_min, x_max) x [y_min, y_max) for one
// output pixel. With circular (wrap-around) padding every tap is valid, so the
// bounds span the full kernel window; with zero padding the window is clipped
// so input coordinates stay inside [0, P.IW) x [0, P.IH).
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv_params & P) {
    kernel_bounds bounds;

    if (P.CIRCULAR) {
        bounds.x_min = 0;
        bounds.x_max = P.KW;
        bounds.y_min = 0;
        bounds.y_max = P.KH;
        return bounds;
    }

    // base_* is the (possibly negative) offset of the first input coordinate
    // touched by this output pixel; the ceil-divisions clip the tap range.
    const int64_t base_y = P.PD_Y - out_y * P.ST_Y;
    bounds.y_min         = max64(0, (base_y + P.DL_Y - 1) / P.DL_Y);
    bounds.y_max         = min64(P.KH, (P.IH + base_y + P.DL_Y - 1) / P.DL_Y);

    const int64_t base_x = P.PD_X - out_x * P.ST_X;
    bounds.x_min         = max64(0, (base_x + P.DL_X - 1) / P.DL_X);
    bounds.x_max         = min64(P.KW, (P.IW + base_x + P.DL_X - 1) / P.DL_X);

    return bounds;
}
3750

@@ -84,19 +97,37 @@ static __global__ void conv2d_kernel(const float * __restrict__ input,
8497
Layout::unpack_indices(global_idx, P, n, c_out, out_y, out_x);
8598

8699
float acc = 0.0f;
100+
if (P.CIRCULAR == 0) {
101+
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
102+
kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
103+
104+
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
105+
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
87106

88-
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
89-
kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
107+
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
108+
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
109+
110+
const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
111+
const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
112+
acc += (input_val * ggml_cuda_cast<float>(kernel_val));
113+
}
114+
}
115+
}
116+
}
117+
else {
118+
for (int64_t c_in = 0; c_in < P.IC; ++c_in) {
119+
kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P);
90120

91-
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
92-
const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y);
121+
for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) {
122+
const int64_t in_y = wrap_coord(calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y), P.IH);
93123

94-
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
95-
const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
124+
for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) {
125+
const int64_t in_x = wrap_coord(calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X), P.IW);
96126

97-
const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
98-
const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
99-
acc += (input_val * ggml_cuda_cast<float>(kernel_val));
127+
const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
128+
const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
129+
acc += (input_val * ggml_cuda_cast<float>(kernel_val));
130+
}
100131
}
101132
}
102133
}
@@ -141,6 +172,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
141172
const int PD_Y = p[3]; // padding_y
142173
const int DL_X = p[4]; // dilation_x
143174
const int DL_Y = p[5]; // dilation_y
175+
const int CIRCULAR = p[6];
144176

145177
// No cwhn
146178
GGML_ASSERT(p[6] == false);
@@ -156,7 +188,7 @@ void ggml_cuda_op_conv2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
156188
const int B = input->ne[3]; // n_batches
157189

158190
const int64_t total = B * OC * OH * OW;
159-
conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total };
191+
conv_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total, CIRCULAR };
160192

161193
if (kernel->type == GGML_TYPE_F16) {
162194
conv2d_cuda_f16(X_D, (half *) K_D, Y_D, params, st);

0 commit comments

Comments
 (0)