@@ -2706,6 +2706,60 @@ void iqk_convert_iq3_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
27062706 }
27072707}
27082708
2709+ void iqk_convert_iq4_kss_q8_k_r8 (int n, const void * vx, size_t bx, void * vy, int nrc_x) {
2710+ GGML_ASSERT (n%QK_K == 0 );
2711+ GGML_ASSERT (nrc_x%8 == 0 );
2712+
2713+ int nb = n/QK_K;
2714+
2715+ const block_iq4_kss * x8[8 ];
2716+
2717+ block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
2718+
2719+ __m256i values[2 ];
2720+ {
2721+ auto v1 = _mm_loadu_si128 ((const __m128i *)iq4k_values+0 );
2722+ auto v2 = _mm_loadu_si128 ((const __m128i *)iq4k_values+1 );
2723+ values[0 ] = MM256_SET_M128I (v1, v1);
2724+ values[1 ] = MM256_SET_M128I (v2, v2);
2725+ }
2726+
2727+ float drow[8 ];
2728+ float dnew[8 ];
2729+ int16_t ls[16 ];
2730+
2731+ __m256i xv[8 ];
2732+ uint32_t block[8 ];
2733+
2734+ for (int ix = 0 ; ix < nrc_x; ix += 8 ) {
2735+ for (int k = 0 ; k < 8 ; ++k) {
2736+ const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
2737+ drow[k] = dptr[0 ];
2738+ x8[k] = (const block_iq4_kss *)(dptr + 1 );
2739+ }
2740+ auto vd = _mm256_loadu_ps (drow);
2741+ for (int i = 0 ; i < nb; ++i) {
2742+ for (int k = 0 ; k < 8 ; ++k) {
2743+ for (int ib32 = 0 ; ib32 < 8 ; ++ib32) {
2744+ auto val = _mm_loadu_si128 ((const __m128i *)x8[k][i].qs +ib32);
2745+ auto val_q = _mm_and_si128 (val, _mm_set1_epi32 (0xfffefffe ));
2746+ val_q = _mm_xor_si128 (val_q, _mm_srli_epi16 (val_q, 1 ));
2747+ xv[ib32] = _mm256_and_si256 (MM256_SET_M128I (_mm_srli_epi16 (val_q, 4 ), val_q), _mm256_set1_epi8 (0xf ));
2748+ auto q4 = x8[k][i].qs + 4 *ib32;
2749+ uint32_t s32 = (q4[0 ] & 0x00010001 ) | ((q4[1 ] & 0x00010001 ) << 2 ) | ((q4[2 ] & 0x00010001 ) << 4 ) | ((q4[3 ] & 0x00010001 ) << 6 );
2750+ uint8_t s8 = (s32 | (s32 >> 15 )) & 0xff ;
2751+ // auto val_s = _mm_madd_epi16(_mm_and_si128(val, _mm_set1_epi32(0x00010001)), _mm_set1_epi64x(0x0008000400020001));
2752+ ls[2 *ib32+0 ] = ls[2 *ib32+1 ] = ((s8 & 254 ) - 127 );
2753+ xv[ib32] = _mm256_shuffle_epi8 (values[s8 & 1 ], xv[ib32]);
2754+ }
2755+ dnew[k] = convert_to_q8_k_r8 (k, 1 .f /127 , xv, ls, block, y[i].qs );
2756+ }
2757+ _mm_storeu_si128 ((__m128i *)y[i].d , _mm256_cvtps_ph (_mm256_mul_ps (vd, _mm256_loadu_ps (dnew)), _MM_ROUND_NEAREST));
2758+ }
2759+ y += nb;
2760+ }
2761+ }
2762+
27092763void iqk_convert_iq4_ks_q8_k_r8 (int n, const void * vx, size_t bx, void * vy, int nrc_x) {
27102764 GGML_ASSERT (n%QK_K == 0 );
27112765 GGML_ASSERT (nrc_x%8 == 0 );
@@ -3132,6 +3186,7 @@ bool iqk_convert_iqk_quants_q80_r8(int type, int n, const void * vx, size_t bx,
31323186 case GGML_TYPE_IQ2_KL : iqk_convert_iq2_kl_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
31333187 case GGML_TYPE_IQ3_KS : iqk_convert_iq3_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
31343188 case GGML_TYPE_IQ3_K : iqk_convert_iq3_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
3189+ case GGML_TYPE_IQ4_KSS: iqk_convert_iq4_kss_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
31353190 case GGML_TYPE_IQ4_KS : iqk_convert_iq4_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
31363191 case GGML_TYPE_IQ4_K : iqk_convert_iq4_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
31373192 case GGML_TYPE_IQ5_KS : iqk_convert_iq5_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
@@ -4718,6 +4773,57 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
47184773 }
47194774}
47204775
4776+ void iqk_convert_iq4_kss_q8_k_r8 (int n, const void * vx, size_t bx, void * vy, int nrc_x) {
4777+ GGML_ASSERT (n%QK_K == 0 );
4778+ GGML_ASSERT (nrc_x%8 == 0 );
4779+
4780+ int nb = n/QK_K;
4781+
4782+ const block_iq4_kss * x8[8 ];
4783+
4784+ block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
4785+
4786+ auto values = vld1q_s8_x2 (iq4k_values);
4787+
4788+ float drow[8 ];
4789+ float dnew[8 ];
4790+ int8_t ls[16 ];
4791+
4792+ int8x16x2_t xv[8 ];
4793+ uint32_t block[8 ];
4794+
4795+ for (int ix = 0 ; ix < nrc_x; ix += 8 ) {
4796+ for (int k = 0 ; k < 8 ; ++k) {
4797+ const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
4798+ drow[k] = dptr[0 ];
4799+ x8[k] = (const block_iq4_kss *)(dptr + 1 );
4800+ }
4801+ auto vd = vld1q_f32_x2 (drow);
4802+ for (int i = 0 ; i < nb; ++i) {
4803+ for (int k = 0 ; k < 8 ; ++k) {
4804+ for (int ib32 = 0 ; ib32 < 8 ; ++ib32) {
4805+ auto q4 = x8[k][i].qs + 4 *ib32;
4806+ uint32_t s32 = (q4[0 ] & 0x00010001 ) | ((q4[1 ] & 0x00010001 ) << 2 ) | ((q4[2 ] & 0x00010001 ) << 4 ) | ((q4[3 ] & 0x00010001 ) << 6 );
4807+ uint8_t s8 = (s32 | (s32 >> 15 )) & 0xff ;
4808+ ls[2 *ib32+0 ] = ls[2 *ib32+1 ] = ((s8 & 254 ) - 127 );
4809+ auto val16 = vandq_u16 (vld1q_u16 ((const uint16_t *)q4), vdupq_n_u16 (0xfffe ));
4810+ auto val8 = vreinterpretq_u8_u16 (veorq_u16 (val16, vshrq_n_u16 (val16, 1 )));
4811+ auto & block_values = values.val [s8 & 1 ];
4812+ xv[ib32].val [0 ] = vqtbl1q_s8 (block_values, vandq_u8 (val8, vdupq_n_u8 (0xf )));
4813+ xv[ib32].val [1 ] = vqtbl1q_s8 (block_values, vshrq_n_u8 (val8, 4 ));
4814+ }
4815+ dnew[k] = convert_to_q8_k_r8 (1 .f /127 , xv, ls, block, (uint32_t *)y[i].qs + k);
4816+ }
4817+ auto d = vld1q_f32_x2 (dnew);
4818+ d.val [0 ] = vmulq_f32 (d.val [0 ], vd.val [0 ]);
4819+ d.val [1 ] = vmulq_f32 (d.val [1 ], vd.val [1 ]);
4820+ vst1_f16 ((float16_t *)y[i].d + 0 , vcvt_f16_f32 (d.val [0 ]));
4821+ vst1_f16 ((float16_t *)y[i].d + 4 , vcvt_f16_f32 (d.val [1 ]));
4822+ }
4823+ y += nb;
4824+ }
4825+ }
4826+
47214827void iqk_convert_iq4_ks_q8_k_r8 (int n, const void * vx, size_t bx, void * vy, int nrc_x) {
47224828 GGML_ASSERT (n%QK_K == 0 );
47234829 GGML_ASSERT (nrc_x%8 == 0 );
@@ -5163,6 +5269,7 @@ bool iqk_convert_iqk_quants_q80_r8(int type, int n, const void * vx, size_t bx,
51635269 case GGML_TYPE_IQ2_KL : iqk_convert_iq2_kl_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
51645270 case GGML_TYPE_IQ3_KS : iqk_convert_iq3_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
51655271 case GGML_TYPE_IQ3_K : iqk_convert_iq3_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
5272+ case GGML_TYPE_IQ4_KSS: iqk_convert_iq4_kss_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
51665273 case GGML_TYPE_IQ4_KS : iqk_convert_iq4_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
51675274 case GGML_TYPE_IQ4_K : iqk_convert_iq4_k_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
51685275 case GGML_TYPE_IQ5_KS : iqk_convert_iq5_ks_q8_k_r8 (n, vx, bx, vy, nrc_x); break ;
0 commit comments