Skip to content

Commit 4ae3fc0

Browse files
committed
more barriers
1 parent ed1ad94 commit 4ae3fc0

File tree

3 files changed

+5
-0
lines changed

3 files changed

+5
-0
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
1616
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
1717
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
1818

19+
barrier();
1920
if (!all_threads) { // when we don't have enough blocks to use all threads
2021
if (i < num_blocks_per_row) {
2122
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
1616
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
1717

1818
if (!all_threads) { // when we don't have enough blocks to use all threads
19+
barrier();
1920
if (i < num_blocks_per_row)
2021
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
2122
barrier();
@@ -39,6 +40,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
3940
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
4041

4142
if (all_threads) {
43+
barrier();
4244
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
4345
barrier();
4446
}

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
1717
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
1818

1919
if (!all_threads) { // when we don't have enough blocks to use all threads
20+
barrier();
2021
if (i < num_blocks_per_row)
2122
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
2223
barrier();
@@ -50,6 +51,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
5051
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
5152

5253
if (all_threads) {
54+
barrier();
5355
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
5456
barrier();
5557
}

0 commit comments

Comments
 (0)