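vulkan: add more barrier() calls before the scale load/sccache write steps in the q2_k, q3_k and q6_k mul_mat_vec shaders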

netrunnereve · netrunnereve · commit 4ae3fc01552c · 2025-01-12T17:21:57.000-05:00
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
     [[unroll]] for (uint n = 0; n < num_rows; ++n) {
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
 
+        barrier();
         if (!all_threads) { // when we don't have enough blocks to use all threads
             if (i < num_blocks_per_row) {
                 const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
 
         if (!all_threads) { // when we don't have enough blocks to use all threads
+            barrier();
             if (i < num_blocks_per_row)
                 sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
             barrier();
@@ -39,6 +40,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
         const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
 
         if (all_threads) {
+            barrier();
             sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
             barrier();
         }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -17,6 +17,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
         const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
 
         if (!all_threads) { // when we don't have enough blocks to use all threads
+            barrier();
             if (i < num_blocks_per_row)
                 sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
             barrier();
@@ -50,6 +51,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
         const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
 
         if (all_threads) {
+            barrier();
             sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
             barrier();
         }