vulkan: optimizations for direct convolution

jeffbolznv · jeffbolznv · commit 9c12ef791512 · 2025-07-28T21:17:31.000-05:00
- Empirically choose a better tile size. Reducing BS_K/BS_NPQ helps fill
  the GPU. The new size should be amenable to using coopmat, too.
- Fix shmem bank conflicts. 16B padding should work with coopmat.
- Some explicit loop unrolling.
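- Skip math and store work for parts of the tile that are out of bounds (OOB).
- Apply the fastdiv optimization to the index divisions (by KW, KW*KH, OW, OW*OH).
- Disable subgroup shuffles (collectives) on NVIDIA, where they are no faster.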
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -908,8 +908,22 @@ struct vk_op_conv2d_push_constants {
     uint32_t nb1;
     uint32_t nb2;
     uint32_t nb3;
+
+    // init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
+    uint32_t KWmp;   uint32_t KWL;
+    uint32_t KWKHmp; uint32_t KWKHL;
+    uint32_t OWmp;   uint32_t OWL;
+    uint32_t OWOHmp; uint32_t OWOHL;
 };
 
+template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
+    // Precompute one (mp, L) fastdiv pair per divisor the conv2d shader divides by: KW, KW*KH, OW, OW*OH. The 2nd/3rd arguments are presumably outputs of init_fastdiv_values (defined outside this hunk).
+    init_fastdiv_values(p.KW,       p.KWmp,    p.KWL);    // shader side: fastdiv(x, p.KWmp, p.KWL)
+    init_fastdiv_values(p.KW*p.KH,  p.KWKHmp,  p.KWKHL);  // shader side: fastdiv(x, p.KWKHmp, p.KWKHL)
+    init_fastdiv_values(p.OW,       p.OWmp,    p.OWL);    // shader side: fastdiv(x, p.OWmp, p.OWL)
+    init_fastdiv_values(p.OW*p.OH,  p.OWOHmp,  p.OWOHL);  // shader side: fastdiv(x, p.OWOHmp, p.OWOHL)
+}
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;
@@ -3052,17 +3066,28 @@ static void ggml_vk_load_shaders(vk_device& device) {
     uint32_t conv2d_BS_K     = 128;
     uint32_t conv2d_BS_CRS   = 16;
     uint32_t use_collectives = 0;  // Enables subgroup ops for preventing the re-calculation of indices.
+    uint32_t conv2d_BS_NPQ = 128;
+    uint32_t conv2d_TS_K   = 8;
+    uint32_t conv2d_SHMEM_PAD = 4;
+
+    if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
+        conv2d_BS_K     = 64;
+        conv2d_BS_CRS   = 32;
+        conv2d_BS_NPQ   = 32;
+        conv2d_TS_K     = 4;
+    }
+
     if (device->subgroup_shuffle &&
-        device->vendor_id != VK_VENDOR_ID_INTEL) {  // Do not enable collectives on Intel, see PR 14316
+        device->vendor_id != VK_VENDOR_ID_INTEL &&   // Do not enable collectives on Intel, see PR 14316.
+        device->vendor_id != VK_VENDOR_ID_NVIDIA) {  // Collectives no faster on NVIDIA.
         use_collectives = 1;
         conv2d_BS_CRS   = std::min(
             device->subgroup_size,
-            conv2d_BS_CRS);  // CRS block size should be capped at sugroup size for correctness when shuffle is used.
+            conv2d_BS_CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
     }
-    uint32_t conv2d_BS_NPQ = 128;
-    uint32_t conv2d_TS_K   = 8;
+
     uint32_t conv2d_shmem_req =
-        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
+        (conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
     if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
         conv2d_BS_CRS = 8;
         if (use_collectives) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -1,13 +1,15 @@
 #version 450
 
+#extension GL_EXT_control_flow_attributes : enable
+
 #ifdef USE_COLLECTIVES
 #    extension GL_KHR_shader_subgroup_shuffle : enable
 #endif
 
 #include "types.comp"
 
 // Make spec constant
-#define SHMEM_PAD 0
+#define SHMEM_PAD 4
 
 // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
 layout(binding = 0) readonly buffer A {
@@ -56,6 +58,12 @@ layout(push_constant) uniform parameter {
     uint32_t nb1;
     uint32_t nb2;
     uint32_t nb3;
+
+    // fastdiv helper values
+    uint32_t KWmp;   uint32_t KWL;
+    uint32_t KWKHmp; uint32_t KWKHL;
+    uint32_t OWmp;   uint32_t OWL;
+    uint32_t OWOHmp; uint32_t OWOHL;
 }
 
 p;
@@ -131,6 +139,14 @@ uint32_t       Br    = tid / BS_NPQ;
 uint32_t       Bc    = tid % BS_NPQ;
 const uint32_t BrpWg = WG_SIZE / BS_NPQ;
 
+// fastdiv(n, mp, L) stands in for n / d, where (mp, L) were generated for the divisor d by init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    // msbs = mulhi(n, mp): umulExtended returns the high (msbs) and low (lsbs) 32 bits of n * mp; only msbs is used
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L; // NOTE(review): msbs + n is a 32-bit add and would wrap for large n -- confirm callers keep n small enough
+}
+
 void main() {
     for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
         for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
@@ -151,9 +167,9 @@ void main() {
         uint32_t cached_KW_idx;
         if (use_collectives == 1) {
             cached_CRS_idx                = B_idx_CRS * BS_CRS + gl_SubgroupInvocationID;
-            cached_Cin_idx                = cached_CRS_idx / (p.KW * p.KH);
+            cached_Cin_idx                = fastdiv(cached_CRS_idx, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
             uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx * p.KW * p.KH);
-            cached_KH_idx                 = cached_CRS_remainder / p.KW;
+            cached_KH_idx                 = fastdiv(cached_CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
             cached_KW_idx                 = cached_CRS_remainder - cached_KH_idx * p.KW;
 
             CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
@@ -162,16 +178,16 @@ void main() {
             KW_idx_a  = subgroupShuffle(cached_KW_idx, Ac);
         } else {
             CRS_idx_a              = B_idx_CRS * BS_CRS + Ac;  // Global CRS_idx_a (column index of A)
-            Cin_idx_a              = CRS_idx_a / (p.KW * p.KH);
+            Cin_idx_a              = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
             uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
-            KH_idx_a               = CRS_remainder / p.KW;
+            KH_idx_a               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
             KW_idx_a               = CRS_remainder - KH_idx_a * p.KW;
         }
 #else
         CRS_idx_a     = B_idx_CRS * BS_CRS + Ac;  // Global CRS_idx_a (column index of A)
-        Cin_idx_a     = CRS_idx_a / (p.KW * p.KH);
+        Cin_idx_a     = fastdiv(CRS_idx_a, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
         CRS_remainder = CRS_idx_a - Cin_idx_a * p.KW * p.KH;
-        KH_idx_a      = CRS_remainder / p.KW;
+        KH_idx_a      = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
         KW_idx_a      = CRS_remainder - KH_idx_a * p.KW;
 #endif
 
@@ -188,13 +204,13 @@ void main() {
             Ash[B_ly * Ash_stride + B_lx] = val;
         }
         /* Load input to B_block: (BS_CRS x BS_NPQ) */
-        for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
+        [[unroll]] for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
             uint32_t B_ly          = r_offset + Br;             /* Row index of B block */
             uint32_t B_lx          = Bc;
             uint32_t NPQ_idx       = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
-            uint32_t N_idx         = NPQ_idx / (p.OH * p.OW);
+            uint32_t N_idx         = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
             uint32_t NPQ_remainder = NPQ_idx - N_idx * p.OH * p.OW;
-            uint32_t OH_idx        = NPQ_remainder / p.OW;
+            uint32_t OH_idx        = fastdiv(NPQ_remainder, p.OWmp, p.OWL); // divide by p.OW;
             uint32_t OW_idx        = NPQ_remainder - OH_idx * p.OW;
 
             uint32_t CRS_idx_b;
@@ -209,16 +225,16 @@ void main() {
                 KW_idx_b  = subgroupShuffle(cached_KW_idx, r_offset + Br);
             } else {
                 CRS_idx_b              = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
-                Cin_idx_b              = CRS_idx_b / (p.KW * p.KH);
+                Cin_idx_b              = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
                 uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
-                KH_idx_b               = CRS_remainder / p.KW;
+                KH_idx_b               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
                 KW_idx_b               = CRS_remainder - KH_idx_b * p.KW;
             }
 #else
             CRS_idx_b              = B_idx_CRS * BS_CRS + B_ly; /* Global CRS index (row index of B) */
-            Cin_idx_b              = CRS_idx_b / (p.KW * p.KH);
+            Cin_idx_b              = fastdiv(CRS_idx_b, p.KWKHmp, p.KWKHL); // divide by (p.KW * p.KH);
             uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b * p.KW * p.KH;
-            KH_idx_b               = CRS_remainder / p.KW;
+            KH_idx_b               = fastdiv(CRS_remainder, p.KWmp, p.KWL); // divide by p.KW;
             KW_idx_b               = CRS_remainder - KH_idx_b * p.KW;
 #endif
 
@@ -233,32 +249,36 @@ void main() {
             Bsh[B_ly * Bsh_stride + B_lx] = val;
         }
         barrier();
-        for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
-            for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
-                regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
-            }
-            for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
-                regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
-            }
-            for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+        if (T_y * TS_K < K) {
+            [[unroll]] for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
+                for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+                    regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
+                }
                 for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
-                    regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
+                    regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
+                }
+                for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+                    for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
+                        regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
+                    }
                 }
             }
         }
         barrier();
     }
     /* Save C* */
-    for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
-        for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
-            uint32_t K_idx   = B_idx_K * BS_K + T_y * TS_K + T_ly;
-            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
-            uint32_t N_idx   = NPQ_idx / (p.OH * p.OW);
-            uint32_t OH_idx  = (NPQ_idx - N_idx * p.OH * p.OW) / p.OW;
-            uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
-            uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
-            if (K_idx < K && NPQ_idx < NPQ) {
-                dst_data[dst_idx] = regC[T_ly][T_lx];
+    if (T_y * TS_K < K) {
+        for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
+            for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
+                uint32_t K_idx   = B_idx_K * BS_K + T_y * TS_K + T_ly;
+                uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
+                uint32_t N_idx   = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
+                uint32_t OH_idx  = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
+                uint32_t OW_idx  = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
+                uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
+                if (K_idx < K && NPQ_idx < NPQ) {
+                    dst_data[dst_idx] = regC[T_ly][T_lx];
+                }
             }
         }
     }