Feat: Add Vulkan circular tiling support

Phylliida · Phylliida · commit f6ac08424f49 · 2025-11-03T13:27:40.000-08:00
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -1943,6 +1943,18 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    // 2D convolution, circular variant; NOTE(review): presumably out-of-range taps wrap around (torus) instead of reading zero padding - confirm against ggml.c
+    GGML_API struct ggml_tensor * ggml_conv_2d_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
     GGML_API struct ggml_tensor * ggml_im2col_3d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2016,6 +2028,19 @@ extern "C" {
             int                  d0,  // dilation dimension 0
             int                  d1); // dilation dimension 1
 
+    // depthwise 2D convolution, circular variant (via im2col and mul_mat)
+    // NOTE(review): presumably padding wraps around (torus) instead of zero-filling - confirm against ggml.c
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1
+
     // Depthwise 2D convolution
     // may be faster than ggml_conv_2d_dw, but not available in all backends
     // a:   KW    KH    1    C    convolution kernel
@@ -2032,12 +2057,35 @@ extern "C" {
             int                   dilation0,
             int                   dilation1);
 
+    // Depthwise 2D convolution (on a torus)
+    // may be faster than ggml_conv_2d_dw_circular, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             int                   stride);
 
+    // circular (on a torus) variant of ggml_conv_transpose_2d_p0
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride);
+
     GGML_API struct ggml_tensor * ggml_conv_2d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
@@ -2048,6 +2096,17 @@ extern "C" {
             int                   p1,  // padding dimension 1
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
+
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
 
     GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
@@ -2156,6 +2215,15 @@ extern "C" {
             int                  p2,
             int                  p3);
 
+    // circular padding: pad each dimension with values from the opposite side of the tensor (wrapping around a torus)
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2169,6 +2237,20 @@ extern "C" {
             int                  rp3
             );
 
+    // circular (wrap-around) variant of ggml_pad_ext; NOTE(review): lp*/rp* presumably are the left/right pad amounts per dimension - confirm
+    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  lp0,
+            int                  rp0,
+            int                  lp1,
+            int                  rp1,
+            int                  lp2,
+            int                  rp2,
+            int                  lp3,
+            int                  rp3
+            );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -940,6 +940,7 @@ struct vk_op_pad_push_constants {
     uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
     uint32_t misalign_offsets;
+    uint32_t circular;
 
     uint32_t lp0; uint32_t rp0;
     uint32_t lp1; uint32_t rp1;
@@ -982,6 +983,7 @@ static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor
     p.rp2 = dst->op_params[5];
     p.lp3 = dst->op_params[6];
     p.rp3 = dst->op_params[7];
+    p.circular = dst->op_params[8];
 
     return p; // fastdiv values and offsets are initialized later in ggml_vk_op
 }
@@ -1249,6 +1251,8 @@ struct vk_op_conv2d_push_constants {
     uint32_t KWKHmp; uint32_t KWKHL;
     uint32_t OWmp;   uint32_t OWL;
     uint32_t OWOHmp; uint32_t OWOHL;
+
+    uint32_t circular;
 };
 
 template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
@@ -1297,6 +1301,8 @@ struct vk_op_conv_transpose_2d_push_constants {
     uint32_t OWOHmp; uint32_t OWOHL;
     uint32_t s0mp; uint32_t s0L;
     uint32_t s1mp; uint32_t s1L;
+
+    uint32_t circular;
 };
 
 template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) {
@@ -1325,6 +1331,7 @@ struct vk_op_conv2d_dw_push_constants {
     int32_t pad_y;
     int32_t dilation_x;
     int32_t dilation_y;
+    uint32_t circular;
 };
 
 struct vk_op_upscale_push_constants {
@@ -10420,6 +10427,8 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
     p.nb2 = static_cast<uint32_t>(nb2 / nb0);
     p.nb3 = static_cast<uint32_t>(nb3 / nb0);
 
+    p.circular = static_cast<uint32_t>(dst->op_params[6]);
+
     GGML_ASSERT(ne03 == ne2);
     GGML_ASSERT(ne02 == ne12);
 
@@ -10469,6 +10478,8 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context
     p.nb2 = static_cast<uint32_t>(nb2 / nb0);
     p.nb3 = static_cast<uint32_t>(nb3 / nb0);
 
+    p.circular = static_cast<uint32_t>(dst->op_params[1]);
+
     GGML_ASSERT(ne02 == ne2);
     GGML_ASSERT(ne03 == ne12);
 
@@ -10492,6 +10503,7 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx
     p.pad_y = dst->op_params[3];
     p.dilation_x = dst->op_params[4];
     p.dilation_y = dst->op_params[5];
+    p.circular = dst->op_params[6];
 
     GGML_ASSERT(src0->ne[3] == p.channels);
     GGML_ASSERT(src1->ne[3] == p.batches);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp
@@ -19,6 +19,7 @@ layout (push_constant) uniform parameter
     int pad_y;
     int dilation_x;
     int dilation_y;
+    uint circular;
 } p;
 
 layout (binding = 0) readonly buffer A {A_TYPE knl_data[];};
@@ -27,6 +28,10 @@ layout (binding = 2) writeonly buffer D {D_TYPE dst_data[];};
 
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
+uint32_t wrap_coord(int coord, uint32_t size) { // Euclidean wrap of any signed coord into [0, size); size must be > 0
+    return (coord >= 0) ? (uint32_t(coord) % size) : (size - 1u - (uint32_t(-1 - coord) % size)); // -1 - coord >= 0 here: no negative % operand, and pads larger than size still wrap correctly
+}
+
 FLOAT_TYPE conv_2d_dw_whcn(uint idx) {
     uint i0 = idx / p.dst_w;
     uint dst_x = idx - i0 * p.dst_w;
@@ -39,19 +44,35 @@ FLOAT_TYPE conv_2d_dw_whcn(uint idx) {
     uint knl_i = c * p.knl_h * p.knl_w;
 
     FLOAT_TYPE sum = 0.0;
-    for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-        uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
-        if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
-            continue;
+
+    if (p.circular != 0u) {
+        for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+            int raw_y   = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y;
+            uint src_y  = wrap_coord(raw_y, p.src_h);
+            for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                int raw_x   = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x;
+                uint src_x  = wrap_coord(raw_x, p.src_w);
+                FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]);
+                FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]);
+                sum = fma(v, k, sum);
+            }
         }
-        for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-            uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
-            if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
+    }
+    else {
+        for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+            uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+            if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
                 continue;
             }
-            FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]);
-            FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]);
-            sum = fma(v, k, sum);
+            for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
+                    continue;
+                }
+                FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * p.src_w + src_x]);
+                FLOAT_TYPE k = FLOAT_TYPE(knl_data[knl_i + knl_y * p.knl_w + knl_x]);
+                sum = fma(v, k, sum);
+            }
         }
     }
     return sum;
@@ -70,19 +91,34 @@ FLOAT_TYPE conv_2d_dw_cwhn(uint idx) {
     uint knl_row = p.knl_w * p.channels;
 
     FLOAT_TYPE sum = 0.0;
-    for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
-        uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
-        if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
-            continue;
+    if (p.circular != 0u) {
+        for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+            int raw_y   = int(dst_y) * p.stride_y + int(knl_y) * p.dilation_y - p.pad_y;
+            uint src_y  = wrap_coord(raw_y, p.src_h);
+            for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                int raw_x   = int(dst_x) * p.stride_x + int(knl_x) * p.dilation_x - p.pad_x;
+                uint src_x  = wrap_coord(raw_x, p.src_w);
+                FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]);
+                FLOAT_TYPE k = FLOAT_TYPE(knl_data[        knl_y * knl_row + knl_x * p.channels + c]);
+                sum = fma(v, k, sum);
+            }
         }
-        for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
-            uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
-            if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
+    }
+    else {
+        for (uint knl_y = 0; knl_y < p.knl_h; ++knl_y) {
+            uint src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
+            if (src_y >= p.src_h) { // src_y < 0 will wrap to a large unsigned int
                 continue;
             }
-            FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]);
-            FLOAT_TYPE k = FLOAT_TYPE(knl_data[        knl_y * knl_row + knl_x * p.channels + c]);
-            sum = fma(v, k, sum);
+            for (uint knl_x = 0; knl_x < p.knl_w; ++knl_x) {
+                uint src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
+                if (src_x >= p.src_w) { // src_x < 0 will wrap to a large unsigned int
+                    continue;
+                }
+                FLOAT_TYPE v = FLOAT_TYPE(src_data[src_i + src_y * src_row + src_x * p.channels + c]);
+                FLOAT_TYPE k = FLOAT_TYPE(knl_data[        knl_y * knl_row + knl_x * p.channels + c]);
+                sum = fma(v, k, sum);
+            }
         }
     }
     return sum;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -70,6 +70,8 @@ layout(push_constant) uniform parameter {
     uint32_t s0mp; uint32_t s0L;
     uint32_t s1mp; uint32_t s1L;
 #endif
+
+    uint32_t circular;
 }
 
 p;
@@ -174,6 +176,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
 }
 #endif
 
+uint32_t wrap_coord(int coord, uint32_t size) { // Euclidean wrap of any signed coord into [0, size); size must be > 0
+    return (coord >= 0) ? (uint32_t(coord) % size) : (size - 1u - (uint32_t(-1 - coord) % size)); // -1 - coord >= 0 here: no negative % operand, and pads larger than size still wrap correctly
+}
+
 void main() {
 #ifdef COOPMAT2
     coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
@@ -274,7 +280,8 @@ void main() {
             KH_idx_b               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
             KW_idx_b               = CRS_remainder - KH_idx_b * p.KW;
 #endif
-
+            uint32_t H_pos;
+            uint32_t W_pos;
 #ifdef TRANSPOSE
             uint32_t H_idx_x_s1 = OH_idx - KH_idx_b * p.d1 + p.p1;
             uint32_t W_idx_x_s0 = OW_idx - KW_idx_b * p.d0 + p.p0;
@@ -284,13 +291,15 @@ void main() {
             uint32_t H_idx = OH_idx * p.s1 + KH_idx_b * p.d1 - p.p1;
             uint32_t W_idx = OW_idx * p.s0 + KW_idx_b * p.d0 - p.p0;
 #endif
+            H_pos            = (p.circular != 0) ? wrap_coord(int(H_idx), p.H) : H_idx;
+            W_pos            = (p.circular != 0) ? wrap_coord(int(W_idx), p.W) : W_idx;
             uint32_t src_idx =
-                min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
+                min(max(W_pos + H_pos * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
             float val = src_data[src_idx];
             if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
-                || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
+                || H_pos >= p.H || W_pos >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
 #ifdef TRANSPOSE
-                || (H_idx_x_s1 - H_idx * p.s1 != 0) || (W_idx_x_s0 - W_idx * p.s0 != 0)
+                || (H_idx_x_s1 - H_pos * p.s1 != 0) || (W_idx_x_s0 - W_pos * p.s0 != 0)
 #endif
                 ) {
                 val = 0.0;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp b/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c