@@ -194,6 +194,53 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
194194}
195195#endif
196196
197+ #if defined(DATA_A_Q3_K)
198+ // 2-byte loads for Q3_K blocks (110 bytes)
// Builds one pair of signed quants (still biased by +4 when the high bit is set).
// The low two bits of each byte come from qs[qs_word] after shifting by qs_shift;
// bit hm_shift of each hmask[hm_word] byte is moved to bit 2 and OR-ed in.
i8vec2 q3k_merge_pair(uint ib_k, uint qs_word, uint hm_word, uint qs_shift, uint hm_shift) {
    return unpack8(int16_t((data_a_packed16[ib_k].qs[qs_word] >> qs_shift) & uint16_t(0x0303))) |
           unpack8(int16_t(((data_a_packed16[ib_k].hmask[hm_word] >> hm_shift) & uint16_t(0x0101)) << 2));
}

// Reads four consecutive 16-bit words from both qs and hmask of a Q3_K block and
// returns the eight resulting quants as two packed 32-bit integers (four 8-bit
// lanes each), with 4 subtracted from every lane.
//
// ib  : combined index; ib / 8 selects the element of data_a_packed16 and
//       ib % 8 contributes to the position inside it.
// iqs : offset added to that position; also used directly (times 2) as the
//       first hmask word index.
i32vec2 repack2(uint ib, uint iqs) {
    const uint block = ib / 8;
    const uint pos   = (ib % 8) * 8 + iqs;

    // First 16-bit word to read from qs and from hmask.
    const uint qs_base = ((pos / 32) * 8 + (pos % 8)) * 2;
    const uint hm_base = iqs * 2;

    // Bit offsets of the wanted field inside each qs byte / hmask byte.
    const uint qs_shift = ((pos % 32) / 8) * 2;
    const uint hm_shift = pos / 8;

    // OR-ing the shifted hmask bit adds 4 where it is set; the bias is removed below.
    const i8vec2 pair0 = q3k_merge_pair(block, qs_base,     hm_base,     qs_shift, hm_shift);
    const i8vec2 pair1 = q3k_merge_pair(block, qs_base + 1, hm_base + 1, qs_shift, hm_shift);
    const i8vec2 pair2 = q3k_merge_pair(block, qs_base + 2, hm_base + 2, qs_shift, hm_shift);
    const i8vec2 pair3 = q3k_merge_pair(block, qs_base + 3, hm_base + 3, qs_shift, hm_shift);

    const i8vec4 lanes_lo = i8vec4(pair0, pair1) - int8_t(4);
    const i8vec4 lanes_hi = i8vec4(pair2, pair3) - int8_t(4);

    return i32vec2(pack32(lanes_lo), pack32(lanes_hi));
}
220+
// Returns data_a[ib / 8].d multiplied by (scale - 32), where scale is assembled
// from two entries of the block's scales array: a 4-bit field from scales[is % 8]
// forms the low bits and a 2-bit field from scales[8 + is % 4] is placed above it.
//
// ib  : combined index; ib / 8 selects the element of data_a.
// iqs : offset inside that element; together with ib % 8 it selects which
//       scale entry (is) is decoded.
float get_d_scale(uint ib, uint iqs) {
    const uint block = ib / 8;
    const uint pos   = (ib % 8) * 8 + iqs;
    const uint is    = pos / 4;

    // Low part: nibble chosen by is / 8 (shift of 0 or 4).
    const uint low_bits  = uint(data_a[block].scales[is % 8] >> (4 * (is / 8))) & 0x0F0Fu;
    // High part: 2-bit field chosen by is / 4 (shift of 0, 2, 4 or 6).
    const uint high_bits = uint(data_a[block].scales[8 + (is % 4)] >> (2 * (is / 4))) & 0x0303u;

    // Narrowing to int8_t keeps only the low byte of the combined value.
    const int8_t scale = int8_t(low_bits | (high_bits << 4));

    return float(data_a[block].d) * float(scale - 32);
}
230+
// Q3_K variant of the integer dot product step: repacks eight quants of A,
// takes two packed 4x8-bit dot products against cache_b_qs[0] and cache_b_qs[1],
// and scales the integer sum by cache_b_ds.x and the block's d * scale factor.
//
// ib_a : combined index of the A data, forwarded to repack2 / get_d_scale.
// iqs  : offset for this invocation; both helpers are called with iqs * 2.
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
    const uint iqs_a = iqs * 2;

    const i32vec2 packed_a = repack2(ib_a, iqs_a);
    const float   scale_a  = get_d_scale(ib_a, iqs_a);

    const int32_t dot_sum = dotPacked4x8EXT(packed_a.x, cache_b_qs[0]) +
                            dotPacked4x8EXT(packed_a.y, cache_b_qs[1]);

    return FLOAT_TYPE(float(cache_b_ds.x) * scale_a * float(dot_sum));
}
242+ #endif
243+
197244#if defined(DATA_A_Q4_K) || defined(DATA_A_Q5_K)
198245// 4-byte loads for Q4_K blocks (144 bytes) and Q5_K blocks (176 bytes)
199246FLOAT_TYPE mul_q8_1(const int32_t q_sum, const vec2 dma, const vec2 dsb, const int32_t sum_divisor) {
0 commit comments