@@ -311,8 +311,8 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords
311311    const float16_t d = bl.block.d;
312312    const uint idx = coordInBlock[1];
313313
314-     const uint ib32 = idx / 32 ;
315-     const uint ib8 = idx / 8 ;
314+     const uint ib32 = ( idx & 0xE0) >> 5 ;
315+     const uint ib8 = ( idx & 0xF8) >> 3 ;
316316
317317    const uint qh = bl.block.qh[ib32];
318318    const uint qs = bl.block.qs[ib8];
@@ -330,14 +330,20 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1
330330   block_iq1_m block;
331331};
332332
333+ layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufIQ1_M_packed64 { // Second buffer-reference view of an IQ1_M block; dequantFuncIQ1_M obtains it by casting its decodeBufIQ1_M argument.
334+    block_iq1_m_packed64 block; // dequantFuncIQ1_M reads block.scales through unpack32() into a uvec2 instead of four separate scales[] loads.
335+ }; // NOTE(review): this view declares buffer_reference_align = 8 while the neighbouring decodeBufIQ1* reference declares 2 -- confirm block addresses really are 8-byte aligned.
336+ 
333337float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2])
334338{
335-     const u16vec4 scales = u16vec4(bl.block.scales[0], bl.block.scales[1], bl.block.scales[2], bl.block.scales[3]) >> 12;
336-     const float16_t d = uint16BitsToHalf(scales.x | (scales.y << 4) | (scales.z << 8) | (scales.w << 12));
339+     decodeBufIQ1_M_packed64 bl64 = decodeBufIQ1_M_packed64(bl);
337340    const uint idx = coordInBlock[1];
338341
339-     const uint ib8 = idx / 8;
340-     const uint ib16 = idx / 16;
342+     uvec2 scales = unpack32(bl64.block.scales);
343+     const float16_t d = uint16BitsToHalf(uint16_t(((scales.x & 0xF000) >> 12) | ((scales.x & 0xF0000000) >> 24) | ((scales.y & 0xF000) >> 4) | ((scales.y & 0xF0000000) >> 16)));
344+ 
345+     const uint ib8 = (idx & 0xF8) >> 3;
346+     const uint ib16 = (idx & 0xF0) >> 4;
341347    const int i8 = int(idx % 8);
342348    const uint sc = bl.block.scales[ib8 / 8];
343349    const uint qs = bl.block.qs[ib8];
0 commit comments