@@ -128,6 +128,18 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
128128}
129129#endif
130130
131+ #if defined(DATA_A_Q4_0)
// Unpacks one entry of a Q4_0 block's `qs` array into two floats.
//
//   ib       - block index, added to a_offset to select the block in data_a
//   iqs      - index into that block's qs array
//   a_offset - base block offset into data_a
//
// Returns vec2(low 4 bits, value >> 4) with 8.0 subtracted from each component.
// The per-block scale `d` is NOT applied here; the caller multiplies by it.
// NOTE(review): `>> 4` yields exactly the high nibble only if qs entries are
// 8-bit values - the element type is declared elsewhere; confirm.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint packed = uint(data_a[a_offset + ib].qs[iqs]);
    const uint lo = packed & 0xF;
    const uint hi = packed >> 4;
    return vec2(lo, hi) - 8.0f;
}
// Unpacks two consecutive qs entries (iqs and iqs + 1) of block
// `a_offset + ib` via dequantize() and concatenates the results.
//
// Component order of the result:
//   x, y = dequantize(ib, iqs,     a_offset).xy
//   z, w = dequantize(ib, iqs + 1, a_offset).xy
//
// As with dequantize(), the per-block scale is not applied here.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const vec2 first  = dequantize(ib, iqs,     a_offset);
    const vec2 second = dequantize(ib, iqs + 1, a_offset);
    // vec4(vec2, vec2) fills components in order: first.x, first.y, second.x, second.y.
    return vec4(first, second);
}
141+ #endif
142+
131143void main() {
132144#ifdef NEEDS_INIT_IQ_SHMEM
133145 init_iq_shmem(gl_WorkGroupSize);
@@ -335,6 +347,7 @@ void main() {
335347 const uint ib = idx / 4;
336348 const uint iqs = idx & 0x03;
337349
350+ #if 0
338351 const float d = float(data_a_packed16[ib].d);
339352 const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
340353 const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
@@ -348,6 +361,20 @@ void main() {
348361 buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
349362 buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
350363 buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
364+ #else
365+ const float d = float(data_a[ib].d);
366+ const vec4 vxy = dequantize4(ib, 4*iqs, 0) * d;
367+ const vec4 vzw = dequantize4(ib, 4*iqs + 2, 0) * d;
368+
369+ buf_a[buf_idx ] = FLOAT_TYPE(vxy.x);
370+ buf_a[buf_idx + 1 ] = FLOAT_TYPE(vxy.z);
371+ buf_a[buf_idx + 2 ] = FLOAT_TYPE(vzw.x);
372+ buf_a[buf_idx + 3 ] = FLOAT_TYPE(vzw.z);
373+ buf_a[buf_idx + 16] = FLOAT_TYPE(vxy.y);
374+ buf_a[buf_idx + 17] = FLOAT_TYPE(vxy.w);
375+ buf_a[buf_idx + 18] = FLOAT_TYPE(vzw.y);
376+ buf_a[buf_idx + 19] = FLOAT_TYPE(vzw.w);
377+ #endif
351378#elif defined(DATA_A_Q4_1)
352379 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
353380 const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
0 commit comments