@@ -140,6 +140,18 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
140140}
141141#endif
142142
143+ #if defined(DATA_A_Q4_1)
// Q4_1 variant (compiled only when DATA_A_Q4_1 is defined).
// Reads element `iqs` of the `qs` array of block `a_offset + ib` in data_a and
// splits it into two components: x = the low 4 bits, y = the value shifted right by 4.
// No scale or offset is applied here; the two raw values are returned in a vec2.
// NOTE(review): y is exactly the high nibble only if `qs` elements are 8 bits wide — confirm against the block declaration.
144+ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
145+ const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
// x <- bits 0..3 of vui, y <- all bits above bit 3 (unmasked).
146+ return vec2(vui & 0xF, vui >> 4);
147+ }
// Four-component variant built on dequantize(): evaluates it for `qs` elements
// `iqs` and `iqs + 1` of the same block and concatenates the two results.
// Resulting layout: (x, y) come from element `iqs`, (z, w) from element `iqs + 1`,
// i.e. the two components of each element stay adjacent rather than grouped by position.
148+ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
149+ const vec2 v01 = dequantize(ib, iqs, a_offset);
150+ const vec2 v23 = dequantize(ib, iqs + 1, a_offset);
151+ return vec4(v01.x, v01.y, v23.x, v23.y);
152+ }
153+ #endif
154+
143155void main() {
144156#ifdef NEEDS_INIT_IQ_SHMEM
145157 init_iq_shmem(gl_WorkGroupSize);
@@ -382,6 +394,7 @@ void main() {
382394 const uint ib = idx / 4;
383395 const uint iqs = idx & 0x03;
384396
397+ #if 0
385398 const float d = float(data_a_packed16[ib].d);
386399 const float m = float(data_a_packed16[ib].m);
387400 const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
@@ -396,6 +409,21 @@ void main() {
396409 buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
397410 buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
398411 buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
412+ #else
413+ const float d = float(data_a[ib].d);
414+ const float m = float(data_a[ib].m);
415+ const vec4 vxy = dequantize4(ib, 4*iqs, 0) * d + m;
416+ const vec4 vzw = dequantize4(ib, 4*iqs + 2, 0) * d + m;
417+
418+ buf_a[buf_idx ] = FLOAT_TYPE(vxy.x);
419+ buf_a[buf_idx + 1 ] = FLOAT_TYPE(vxy.z);
420+ buf_a[buf_idx + 2 ] = FLOAT_TYPE(vzw.x);
421+ buf_a[buf_idx + 3 ] = FLOAT_TYPE(vzw.z);
422+ buf_a[buf_idx + 16] = FLOAT_TYPE(vxy.y);
423+ buf_a[buf_idx + 17] = FLOAT_TYPE(vxy.w);
424+ buf_a[buf_idx + 18] = FLOAT_TYPE(vzw.y);
425+ buf_a[buf_idx + 19] = FLOAT_TYPE(vzw.w);
426+ #endif
399427#elif defined(DATA_A_Q5_0)
400428 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
401429 const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
0 commit comments