* Performance fixes: minimized branch divergence, uses collectives to

etasnadi · etasnadi · commit 0715985edfba · 2025-07-12T23:50:51.000+02:00
eliminate redundant calculations; macros removed.

* Kernel shared memory size check

* Updates test-backend-ops to support graphs for performance
  measurement.
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1011,29 +1011,35 @@ class vk_perf_logger {
     void print_timings() {
         if(timings.empty()){
             return;
-        }        
+        }
+        uint64_t total_all_op_times = 0;
         std::cerr << "----------------\nVulkan Timings:" << std::endl;
         for (const auto& t : timings) {
-            uint64_t total = 0;
+            uint64_t total_op_times = 0;
             for (const auto& time : t.second) {
-                total += time;
+                total_op_times += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us";
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0) << " us";
 
             // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
             auto it = flops.find(t.first);
             if(it != flops.end() && (it->second).size() == t.second.size()){
-                uint64_t total_nflops = 0;
+                uint64_t total_op_flops = 0;
                 for(const auto& elem : it->second){
-                    total_nflops += elem;
+                    total_op_flops += elem;
                 }
-                std::cout << " (" << (double(total_nflops)/(1000.0*1000.0*1000.0)) / (double(total)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
+                std::cerr << " (" << (double(total_op_flops)/(1000.0*1000.0*1000.0)) / (double(total_op_times)/(1000.0*1000.0*1000.0)) << " GFLOPS/s)";
             }
 
+            total_all_op_times += total_op_times;
 
             std::cerr << std::endl;
         }
 
+        if(timings.size() > 0){
+            std::cerr << "Total time: " << total_all_op_times/1000.0 << " us." << std::endl;
+        }
+
         timings.clear();
         flops.clear();        
     }
@@ -1072,6 +1078,7 @@ class vk_perf_logger {
             uint64_t size_K = Cin*KW*KH;
             uint64_t size_N = N*OW*OH;
             uint64_t n_flops = size_M*size_N*(size_K+(size_K-1));
+            name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N);
             flops[name].push_back(n_flops);
             timings[name].push_back(time);
             return;
@@ -3026,7 +3033,18 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {128 /* equal to BS_K in the shader */, 128 /* equal to BS_NPQ in the shader */, 1}, {}, 1);
+    // conv2d
+    uint32_t conv2d_WG_SIZE = 256;
+    uint32_t conv2d_BS_K = 128;
+    uint32_t conv2d_BS_CRS = 16;
+    uint32_t conv2d_BS_NPQ = 128;
+    uint32_t conv2d_TS_K = 8;
+    uint32_t conv2d_shmem_req = (conv2d_BS_K*(conv2d_BS_CRS+1) + conv2d_BS_CRS*(conv2d_BS_NPQ+1))*sizeof(float);
+    if(device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req){
+        conv2d_BS_CRS = 8;
+        conv2d_TS_K = 8;
+    }    
+    ggml_vk_create_pipeline(device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3, sizeof(vk_op_conv2d_push_constants), {conv2d_BS_K, conv2d_BS_NPQ, 1}, {conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f32, "conv2d_dw_cwhn_f32", conv2d_dw_cwhn_f32_len, conv2d_dw_cwhn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
@@ -10200,6 +10218,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
+        }else if(cgraph->nodes[i]->op == GGML_OP_CONV_2D){
+            // Add CRS x NPQ x type_size bytes so CONV_2D is accounted with as many bytes as mul_mat has in im2col->mul_mat mode.
+            auto CRS_size = cgraph->nodes[i]->src[0]->ne[0]*cgraph->nodes[i]->src[0]->ne[1]*cgraph->nodes[i]->src[0]->ne[2];
+            auto NPQ_size = cgraph->nodes[i]->ne[0]*cgraph->nodes[i]->ne[1]*cgraph->nodes[i]->ne[3];
+            total_mat_mul_bytes += NPQ_size*CRS_size*ggml_type_size(cgraph->nodes[i]->type);
         }
         i += ctx->num_additional_fused_ops;
         ctx->num_additional_fused_ops = 0;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -1,9 +1,16 @@
 #version 450
 
-#extension GL_EXT_control_flow_attributes : enable
+#define USE_COLLECTIVES
+
+#ifdef USE_COLLECTIVES
+#extension GL_KHR_shader_subgroup_shuffle: enable
+#endif
 
 #include "types.comp"
 
+// TODO: make SHMEM_PAD a specialization constant
+#define SHMEM_PAD 0
+
 // shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
 layout (binding = 0) readonly buffer A {A_TYPE knl_data[];};    // src0 - kernel:   [KW, KH, Cin, Cout] 
 layout (binding = 1) readonly buffer B {B_TYPE src_data[];};    // src1 - input:    [W, H, Cin, N] -- channel_first format
@@ -45,12 +52,16 @@ layout (push_constant) uniform parameter {
     uint32_t nb3;
 } p;
 
-#define WG_SIZE 256
-
-layout(local_size_x = WG_SIZE, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+// Blocktile sizes
+layout(constant_id = 1) const uint BS_K = 128;
+layout(constant_id = 2) const uint BS_CRS = 16;
+layout(constant_id = 3) const uint BS_NPQ = 128;
+// Thread-tile sizes
+layout(constant_id = 4) const uint TS_K = 8;
 
 uint32_t tid = gl_LocalInvocationID.x;
-const uint32_t bs = gl_WorkGroupSize.x;
+const uint32_t WG_SIZE = gl_WorkGroupSize.x;
 
 uint splitWork(uint work_size, uint block_size){
     return (block_size + work_size -1) / block_size;
@@ -62,16 +73,11 @@ uint32_t NPQ = p.N*p.OH*p.OW;
 
 uint32_t n_elems_out = K*NPQ;
 
-// Blocktile sizes
-const uint32_t BS_K = 128;
-const uint32_t BS_CRS = 16;
-const uint32_t BS_NPQ = 128;
-
 // Number of blocktiles per input
 uint32_t NB_CRS = splitWork(CRS, BS_CRS);
 
-const uint32_t Ash_stride = BS_CRS+1;
-const uint32_t Bsh_stride = BS_NPQ+1;
+const uint32_t Ash_stride = BS_CRS+SHMEM_PAD;
+const uint32_t Bsh_stride = BS_NPQ+SHMEM_PAD;
 
 const uint32_t Ash_numel = BS_K*BS_CRS;
 const uint32_t Bsh_numel = BS_CRS*BS_NPQ;
@@ -83,7 +89,6 @@ shared float Ash[Ash_len];  // K x CRS
 shared float Bsh[Bsh_len];  // CRS x NPQ
 
 // Threadtile sizes
-const uint32_t TS_K = 16;
 const uint32_t TS_NPQ = BS_K*BS_NPQ / WG_SIZE / TS_K;
 
 // Number of threadtiles per blocktile
@@ -111,134 +116,111 @@ uint32_t T_x = tid % NT_NPQ;
 
 uint32_t Ar = tid / BS_CRS;
 uint32_t Ac = tid % BS_CRS;
-uint32_t ArpWg = WG_SIZE / BS_CRS;
+const uint32_t ArpWg = WG_SIZE / BS_CRS;
 
 uint32_t Br = tid / BS_NPQ;
 uint32_t Bc = tid % BS_NPQ;
-uint32_t BrpWg = WG_SIZE / BS_NPQ;
+const uint32_t BrpWg = WG_SIZE / BS_NPQ;
 
-void initReg(){
+void main(){\
     for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
         for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
             regC[T_ly][T_lx] = 0.0;
         }
     }
-}
-
-void outProdReg(){
-    for(uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++){
-        for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
-            regA[T_ly] = Ash[(T_y*TS_K + T_ly)*Ash_stride + CRS_lidx];
+    /* Advance block in CRS dim */
+    for(uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++){
+        #ifdef USE_COLLECTIVES
+        uint32_t cached_CRS_idx = B_idx_CRS*BS_CRS + gl_SubgroupInvocationID;
+        uint32_t cached_Cin_idx = cached_CRS_idx / (p.KW*p.KH);
+        uint32_t cached_CRS_remainder = (cached_CRS_idx - cached_Cin_idx*p.KW*p.KH);
+        uint32_t cached_KH_idx = cached_CRS_remainder / p.KW;
+        uint32_t cached_KW_idx = cached_CRS_remainder - cached_KH_idx*p.KW;
+
+        uint32_t CRS_idx_a = subgroupShuffle(cached_CRS_idx, Ac);
+        uint32_t Cin_idx_a = subgroupShuffle(cached_Cin_idx, Ac);
+        uint32_t KH_idx_a = subgroupShuffle(cached_KH_idx, Ac);
+        uint32_t KW_idx_a = subgroupShuffle(cached_KW_idx, Ac);
+        #else
+        uint32_t CRS_idx_a = B_idx_CRS*BS_CRS + Ac;          // Global CRS_idx_a (column index of A)
+        uint32_t Cin_idx_a = CRS_idx_a / (p.KW*p.KH);
+        uint32_t CRS_remainder = CRS_idx_a - Cin_idx_a*p.KW*p.KH;
+        uint32_t KH_idx_a = CRS_remainder / p.KW;
+        uint32_t KW_idx_a = CRS_remainder - KH_idx_a*p.KW;
+        #endif
+
+        /* Load kernel to A_block: (BS_K x BS_CRS)*/
+        for(uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg){
+            uint32_t B_ly = r_offset + Ar;
+            uint32_t B_lx = Ac;
+            uint32_t K_idx = B_idx_K*BS_K + B_ly;                /* Global K_idx (row index of A)*/
+            uint32_t knl_idx = min(KW_idx_a + KH_idx_a*p.nb01 + Cin_idx_a*p.nb02 + K_idx*p.nb03, K*CRS-1);
+            float val = knl_data[knl_idx];
+            if(K_idx >= K || CRS_idx_a >= CRS){
+                val = 0.0;
+            }
+            Ash[B_ly * Ash_stride + B_lx] = val;
         }
-        for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
-            regB[T_lx] = Bsh[CRS_lidx*Bsh_stride + T_x*TS_NPQ+T_lx];
+        /* Load input to B_block: (BS_CRS x BS_NPQ) */
+        for(uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg){
+            uint32_t B_ly = r_offset + Br;                      /* Row index of B block */
+            uint32_t B_lx = Bc;
+            uint32_t NPQ_idx = B_idx_NPQ*BS_NPQ + B_lx;         /* Global NPQ index (column index of B) */
+            uint32_t N_idx = NPQ_idx / (p.OH*p.OW);
+            uint32_t NPQ_remainder = NPQ_idx - N_idx*p.OH*p.OW;
+            uint32_t OH_idx = NPQ_remainder / p.OW;
+            uint32_t OW_idx = NPQ_remainder - OH_idx*p.OW;
+            
+            #ifdef USE_COLLECTIVES
+            uint32_t CRS_idx_b = subgroupShuffle(cached_CRS_idx, r_offset + Br);
+            uint32_t Cin_idx_b = subgroupShuffle(cached_Cin_idx, r_offset + Br);
+            uint32_t KH_idx_b = subgroupShuffle(cached_KH_idx, r_offset + Br);
+            uint32_t KW_idx_b = subgroupShuffle(cached_KW_idx, r_offset + Br);
+            #else
+            uint32_t CRS_idx_b = B_idx_CRS*BS_CRS + B_ly;         /* Global CRS index (row index of B) */
+            uint32_t Cin_idx_b = CRS_idx_b / (p.KW*p.KH);
+            uint32_t CRS_remainder = CRS_idx_b - Cin_idx_b*p.KW*p.KH;
+            uint32_t KH_idx_b = CRS_remainder / p.KW;
+            uint32_t KW_idx_b = CRS_remainder - KH_idx_b*p.KW;
+            #endif
+            
+            uint32_t H_idx = OH_idx*p.s1 + KH_idx_b*p.d1 - p.p1;
+            uint32_t W_idx = OW_idx*p.s0 + KW_idx_b*p.d0 - p.p0;
+            uint32_t src_idx = min(max(W_idx + H_idx*p.nb11 + Cin_idx_b*p.nb12 + N_idx*p.nb13, 0), p.Cin*p.N*p.W*p.H-1);
+            float val = src_data[src_idx];
+            if(CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W){
+                val = 0.0;
+            }
+            Bsh[B_ly * Bsh_stride + B_lx] = val;
         }
-        for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
+        barrier();
+        for(uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++){
+            for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
+                regA[T_ly] = Ash[(T_y*TS_K + T_ly)*Ash_stride + CRS_lidx];
+            }
             for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
-                regC[T_ly][T_lx] += regA[T_ly] * regB[T_lx];
+                regB[T_lx] = Bsh[CRS_lidx*Bsh_stride + T_x*TS_NPQ+T_lx];
+            }
+            for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
+                for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
+                    regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
+                }
             }
         }
+        barrier();
     }
-}
-
-// Generate different functions for computing the sides.
-
-#define NOOP()
-
-#define DEF_BOUNDARY_CONDITION_A_IF()\
-if(K_idx < K && CRS_idx < CRS){
-
-#define DEF_BOUNDARY_CONDITION_A_ELSE()\
-}else{\
-    Ash[B_ly * Ash_stride + B_lx] = 0.0;\
-}
-
-#define DEF_BOUNDARY_CONDITION_B_IF()\
-if(CRS_idx < CRS && NPQ_idx < NPQ){
-
-#define DEF_BOUNDARY_CONDITION_B_ELSE()\
-}else{\
-    Bsh[B_ly * Bsh_stride + B_lx] = 0.0;\
-}
-
-#define MAIN_LOOP(FUNC_NAME_SUFFIX, BOUNDARY_CONDITION_A_IF, BOUNDARY_CONDITION_A_ELSE, BOUNDARY_CONDITION_B_IF, BOUNDARY_CONDITION_B_ELSE)\
-void mainLoop ## FUNC_NAME_SUFFIX(){\
-    initReg();\
-    /* Advance block in CRS dim */\
-    for(uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++){\
-        /* Load kernel to A_block: (BS_K x BS_CRS)*/\
-        for(uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg){\
-            uint32_t B_ly = r_offset + Ar;\
-            uint32_t B_lx = Ac;\
-            uint32_t K_idx = B_idx_K*BS_K + B_ly;                /* Global K_idx (row index of A)*/\
-            uint32_t CRS_idx = B_idx_CRS*BS_CRS + B_lx;          /* Global CRS_idx (column index of A)*/\
-            BOUNDARY_CONDITION_A_IF()\
-                uint32_t Cin_idx = CRS_idx / (p.KW*p.KH);\
-                uint32_t KH_idx = (CRS_idx - Cin_idx*p.KW*p.KH) / p.KW;\
-                uint32_t KW_idx = CRS_idx - Cin_idx*p.KW*p.KH - KH_idx*p.KW;\
-                uint32_t knl_idx = KW_idx + KH_idx*p.nb01 + Cin_idx*p.nb02 + K_idx*p.nb03;\
-                Ash[B_ly * Ash_stride + B_lx] = knl_data[knl_idx];\
-            BOUNDARY_CONDITION_A_ELSE()\
-        }\
-        barrier();\
-        /* Load input to B_block: (BS_CRS x BS_NPQ) */\
-        for(uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg){\
-            uint32_t B_ly = r_offset + Br;                      /* Row index of B block */\
-            uint32_t B_lx = Bc;                                 /* Column index of B block */\
-            uint32_t CRS_idx = B_idx_CRS*BS_CRS + B_ly;         /* Global CRS index (row index of B) */\
-            uint32_t NPQ_idx = B_idx_NPQ*BS_NPQ + B_lx;         /* Global NPQ index (column index of B) */\
-            BOUNDARY_CONDITION_B_IF()\
-                uint32_t Cin_idx = CRS_idx / (p.KW*p.KH);\
-                uint32_t KH_idx = (CRS_idx - Cin_idx*p.KW*p.KH) / p.KW;\
-                uint32_t KW_idx = CRS_idx - Cin_idx*p.KW*p.KH - KH_idx*p.KW;\
-                uint32_t N_idx = NPQ_idx / (p.OH*p.OW);\
-                uint32_t OH_idx = (NPQ_idx - N_idx*p.OH*p.OW) / p.OW;\
-                uint32_t OW_idx = NPQ_idx - N_idx*p.OH*p.OW - OH_idx*p.OW;\
-                uint32_t H_idx = OH_idx*p.s1 + KH_idx*p.d1 - p.p1;\
-                uint32_t W_idx = OW_idx*p.s0 + KW_idx*p.d0 - p.p0;\
-                if(H_idx >= 0 && H_idx < p.H && W_idx >= 0 && W_idx < p.W){\
-                    uint32_t src_idx = W_idx + H_idx*p.nb11 + Cin_idx*p.nb12 + N_idx*p.nb13;\
-                    Bsh[B_ly * Bsh_stride + B_lx] = src_data[src_idx];\
-                }else{\
-                    Bsh[B_ly * Bsh_stride + B_lx] = 0.0;\
-                }\
-            BOUNDARY_CONDITION_B_ELSE()\
-        }\
-        barrier();\
-        outProdReg();\
-        barrier();\
-    }\
-    /* Save C* */\
-    for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){\
-        for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){\
-            uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;\
-            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;\
-            if(K_idx < K && NPQ_idx < NPQ){\
-                uint32_t N_idx = NPQ_idx / (p.OH*p.OW);\
-                uint32_t OH_idx = (NPQ_idx - N_idx*p.OH*p.OW) / p.OW;\
-                uint32_t OW_idx = NPQ_idx - N_idx*p.OH*p.OW - OH_idx*p.OW;\
-                uint32_t dst_idx = OW_idx + OH_idx*p.nb1 + K_idx*p.nb2 + N_idx*p.nb3;\
-                dst_data[dst_idx] = regC[T_ly][T_lx];\
-            }\
-        }\
-    }\
-}
-
-// Generates mainLoopBoundaryCheck
-MAIN_LOOP(BoundaryCheck, 
-    DEF_BOUNDARY_CONDITION_A_IF, 
-    DEF_BOUNDARY_CONDITION_A_ELSE, 
-    DEF_BOUNDARY_CONDITION_B_IF, 
-    DEF_BOUNDARY_CONDITION_B_ELSE)
-
-// Generates mainLoopNoBoundaryCheck
-MAIN_LOOP(NoBoundaryCheck, 
-    NOOP, NOOP, NOOP, NOOP)
-
-void main(){
-    if(gl_WorkGroupID.x == gl_NumWorkGroups.x-1 || gl_WorkGroupID.y == gl_NumWorkGroups.y-1){
-        mainLoopBoundaryCheck();
-    }else{
-        mainLoopNoBoundaryCheck();
+    /* Save C* */
+    for(uint32_t T_ly = 0; T_ly < TS_K; T_ly++){
+        for(uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++){
+            uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
+            uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
+            uint32_t N_idx = NPQ_idx / (p.OH*p.OW);
+            uint32_t OH_idx = (NPQ_idx - N_idx*p.OH*p.OW) / p.OW;
+            uint32_t OW_idx = NPQ_idx - N_idx*p.OH*p.OW - OH_idx*p.OW;
+            uint32_t dst_idx = OW_idx + OH_idx*p.nb1 + K_idx*p.nb2 + N_idx*p.nb3;
+            if(K_idx < K && NPQ_idx < NPQ){
+                dst_data[dst_idx] = regC[T_ly][T_lx];
+            }
+        }
     }
 }