Skip to content

Commit c5b7162

Browse files
author
Italo Nicola
committed
(wip) Vulkan: Adreno Q6_K fix
1 parent 6b50346 commit c5b7162

File tree

5 files changed

+91
-25
lines changed

5 files changed

+91
-25
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -240,14 +240,14 @@ enum FaHeadSizes {
240240
};
241241

242242
// XXX: Use value queried from the driver
243-
#if 0
243+
#if 1
244244
const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 27;
245245
const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 27;
246246
const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 27;
247247
#else
248-
const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 27;
249-
const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 27;
250-
const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 27;
248+
const uint64_t MAX_ADDRESS_SPACE_SIZE = 1 << 26;
249+
const uint64_t MAX_ADDRESS_SPACE_SIZE_MUL_MAT = 1 << 26;
250+
const uint64_t MAX_ADDRESS_SPACE_SIZE_OUT_PROD = 1 << 26;
251251
#endif
252252

253253
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -4415,11 +4415,13 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
44154415
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
44164416
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
44174417
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
4418-
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
4418+
#if 0
4419+
std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
44194420
for (auto& buffer : descriptor_buffer_infos) {
44204421
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
44214422
}
4422-
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
4423+
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
4424+
#endif
44234425
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
44244426
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
44254427

@@ -5439,6 +5441,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
54395441
d_sz * ne12 * ne13 >= tiling_threshold);
54405442
#endif
54415443

5444+
if (tiling_debug) {
5445+
fprintf(stderr, "tiling enabled ? %d (%lu > %lu ?)\n", do_tiling, x_sz * ne02 * ne03 + y_sz * ne12 * ne13 + d_sz * ne12 * ne13, tiling_threshold);
5446+
}
5447+
54425448
// XXX
54435449
bool do_splitting = false;
54445450
#if 0
@@ -5980,7 +5986,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
59805986

59815987
if (ne01 > max_groups_x) {
59825988
groups_z = 64;
5989+
//groups_z = 96;
59835990
groups_x = CEIL_DIV(groups_x, groups_z);
5991+
GGML_ASSERT(max_groups_x > groups_x);
59845992
}
59855993

59865994
// compute

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#extension GL_EXT_control_flow_attributes : enable
2+
//#extension GL_EXT_integer_dot_product : require
23
#extension GL_EXT_shader_16bit_storage : require
34
#extension GL_EXT_shader_8bit_storage : require
45

@@ -10,7 +11,7 @@
1011

1112
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
1213
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
13-
#if !defined(DATA_A_Q8_0) && !defined(DATA_A_Q4_0) && !defined(DATA_A_Q4_1)
14+
#if !defined(DATA_A_Q8_0) && !defined(DATA_A_Q4_0) && !defined(DATA_A_Q4_1) && !defined(DATA_A_Q6_K)
1415
layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
1516
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
1617
#endif
@@ -94,15 +95,15 @@ shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
9495

9596
void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
9697
// sum up partial sums and write back result
97-
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
98-
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
98+
for (uint j = 0; j < NUM_COLS; ++j) {
99+
for (uint n = 0; n < num_rows; ++n) {
99100
tmpsh[j][n][tid] = temp[j][n];
100101
}
101102
}
102103
barrier();
103-
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
104+
for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
104105
if (tid < s) {
105-
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
106+
for (uint j = 0; j < NUM_COLS; ++j) {
106107
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
107108
tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
108109
}
@@ -111,8 +112,8 @@ void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32
111112
barrier();
112113
}
113114
if (tid == 0) {
114-
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
115-
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
115+
for (uint j = 0; j < NUM_COLS; ++j) {
116+
for (uint n = 0; n < num_rows; ++n) {
116117
data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
117118
}
118119
}

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp

Lines changed: 64 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ uint csel = 0;
1414
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
1515
const uint y_idx = i * QUANT_K + y_offset;
1616

17-
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
17+
for (uint n = 0; n < num_rows; ++n) {
1818
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
1919
csel ^= 1;
2020

@@ -27,15 +27,39 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
2727
continue;
2828
}
2929

30-
const uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
31-
const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
30+
#if 0
31+
const uint32_t ql0_u32 =
32+
uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) |
33+
(uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
34+
const uint32_t ql32_u32 =
35+
uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) |
36+
(uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
37+
const uint32_t qh_u32 =
38+
uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) |
39+
(uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
40+
#else
41+
const uint32_t ql0_u32 =
42+
uint32_t(data_a[ib0 + i].ql[ql_offset]) |
43+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 1]) << 8) |
44+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 2]) << 16) |
45+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 3]) << 24);
46+
const uint32_t ql32_u32 =
47+
uint32_t(data_a[ib0 + i].ql[ql_offset + 32]) |
48+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 33]) << 8) |
49+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 34]) << 16) |
50+
(uint32_t(data_a[ib0 + i].ql[ql_offset + 35]) << 24);
51+
const uint32_t qh_u32 =
52+
uint32_t(data_a[ib0 + i].qh[qh_offset + 0]) |
53+
(uint32_t(data_a[ib0 + i].qh[qh_offset + 1]) << 8) |
54+
(uint32_t(data_a[ib0 + i].qh[qh_offset + 2]) << 16) |
55+
(uint32_t(data_a[ib0 + i].qh[qh_offset + 3]) << 24);
56+
#endif
3257

3358
const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
3459
const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
3560
const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
3661
const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
3762

38-
const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
3963
const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
4064
const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
4165
const uint32_t qh4_u32 = (qh_u32 & 0x30303030);
@@ -46,10 +70,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
4670
const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
4771
const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
4872

73+
#if 0
4974
const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
5075
const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
5176
const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
5277
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
78+
#else
79+
const vec4 q0 = vec4(float(q0_u32 & 0xFF), float((q0_u32 >> 8) & 0xFF), float((q0_u32 >> 16) & 0xFF), float(q0_u32 >> 24)) - 32;
80+
const vec4 q1 = vec4(float(q1_u32 & 0xFF), float((q1_u32 >> 8) & 0xFF), float((q1_u32 >> 16) & 0xFF), float(q1_u32 >> 24)) - 32;
81+
const vec4 q2 = vec4(float(q2_u32 & 0xFF), float((q2_u32 >> 8) & 0xFF), float((q2_u32 >> 16) & 0xFF), float(q2_u32 >> 24)) - 32;
82+
const vec4 q3 = vec4(float(q3_u32 & 0xFF), float((q3_u32 >> 8) & 0xFF), float((q3_u32 >> 16) & 0xFF), float(q3_u32 >> 24)) - 32;
83+
#endif
5384

5485
if (all_threads) {
5586
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
@@ -58,14 +89,38 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
5889

5990
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
6091

61-
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
92+
for (uint j = 0; j < NUM_COLS; ++j) {
93+
94+
#if 0
6295
vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]);
6396
vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]);
6497
vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
6598
vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
99+
#else
100+
vec4 by0 =
101+
vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 0],
102+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 1],
103+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 2],
104+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 3]);
105+
vec4 by32 =
106+
vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8],
107+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 1],
108+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 2],
109+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 8 + 3]);
110+
vec4 by64 =
111+
vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16],
112+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 1],
113+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 2],
114+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 16 + 3]);
115+
vec4 by96 =
116+
vec4(data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24],
117+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 1],
118+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 2],
119+
data_b[(j*p.batch_stride_b + b_offset + y_idx) + 4 * 24 + 3]);
120+
#endif
66121

67122
FLOAT_TYPE sum[4] = {0, 0, 0, 0};
68-
[[unroll]] for (uint l = 0; l < 4; ++l) {
123+
for (uint l = 0; l < 4; ++l) {
69124
sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]);
70125
sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]);
71126
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
@@ -99,16 +154,16 @@ void compute_outputs(const uint first_row, const uint num_rows) {
99154
const uint s_offset = 8*v_im + is;
100155
const uint y_offset = 128*v_im + l0;
101156

102-
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
103-
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
157+
for (uint j = 0; j < NUM_COLS; ++j) {
158+
for (uint i = 0; i < NUM_ROWS; ++i) {
104159
temp[j][i] = FLOAT_TYPE(0);
105160
}
106161
}
107162

108163
const uint nbr_par_th = num_blocks_per_row%it_size;
109164
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
110165
uint i0 = 0;
111-
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
166+
for (; i0 < nbr_all_th; i0 += it_size)
112167
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
113168
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
114169

ggml/src/ggml-vulkan/vulkan-shaders/types.comp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -347,7 +347,7 @@ struct block_q6_K_packed16
347347
#if defined(DATA_A_Q6_K)
348348
#define QUANT_K QUANT_K_Q6_K
349349
#define A_TYPE block_q6_K
350-
#define A_TYPE_PACKED16 block_q6_K_packed16
350+
//#define A_TYPE_PACKED16 block_q6_K_packed16
351351
#endif
352352

353353
// IQuants

tests/test-backend-ops.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4927,7 +4927,7 @@ static const ggml_type all_types[] = {
49274927
GGML_TYPE_Q8_0,
49284928
// GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
49294929
// GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
4930-
// GGML_TYPE_Q6_K,
4930+
GGML_TYPE_Q6_K,
49314931
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
49324932
// GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
49334933
// GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
@@ -4950,7 +4950,7 @@ static const ggml_type other_types[] = {
49504950
GGML_TYPE_Q8_0,
49514951
// GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
49524952
// GGML_TYPE_Q5_K,
4953-
// GGML_TYPE_Q6_K,
4953+
GGML_TYPE_Q6_K,
49544954
// GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
49554955
// GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
49564956
// GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
@@ -5303,6 +5303,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
53035303
test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32, 1024, 256, 4096*40, {1, 1}, {1, 1}));
53045304
test_cases.emplace_back(new test_out_prod(GGML_TYPE_Q8_0, GGML_TYPE_F32, 1024, 256, 4096*40, {1, 1}, {1, 1}));
53055305

5306+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K, GGML_TYPE_F32, 151936, 1, 1024, {1, 1}, {1, 1}));
5307+
53065308
#if 0
53075309
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 2, 2, 32, {1, 1}, {1, 1}));
53085310
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 4, 4, 32, {1, 1}, {1, 1}));

0 commit comments

Comments
 (0)