@@ -664,6 +664,69 @@ __device__ __forceinline__ float vec_dot_iq3_k_q8_1(
664664
665665}
666666
// Dot product of one iq3_ks block fragment with q8_1 quants (MMVQ path).
// vbq points at a per-row float scale immediately followed by the
// block_iq3_ks array; kbx selects the block, iiqs the lane's position in it.
// Each thread handles 8 quants in each of the four 32-quant sub-blocks of
// one 128-quant half of the block.
__device__ __forceinline__ float vec_dot_iq3_ks_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iiqs) {

    // Row scale sits in front of the block array.
    const float d = *(const float *)vbq;
    const block_iq3_ks * bq3 = (const block_iq3_ks *)((const char *)vbq + sizeof(float)) + kbx;

    const int iqs   = iiqs/4;
    const int ib128 = iqs/4;  // which half: 0 -> quants 0...127, 1 -> quants 128...255
    const int il8   = iqs%4;  // which 8-quant slice of each 32-block: 0 -> 0...7, ..., 3 -> 24...31

    const uint32_t * ql = (const uint32_t *)bq3->qs + 8*ib128 + 2*il8;
    const uint32_t * qh = (const uint32_t *)bq3->qh + 2*il8;

    // aux8 aliases aux32 so a packed 4-byte table index can be read bytewise.
    uint32_t aux32;
    const uint8_t * aux8 = (const uint8_t *)&aux32;

    // qh packs the high bits for both 128-quant halves; pre-shift so the bit
    // for this half lands in position 2 of each byte after the >> 2.
    const int hshift = 4*(1-ib128);

    // The low bit of each 32-block scale byte selects offset 0 or 64 into
    // iq3k_table for that sub-block.
    const uint16_t * values[4] = {
        iq3k_table + ((bq3->scales[4*ib128+0] << 6) & 0x40),
        iq3k_table + ((bq3->scales[4*ib128+1] << 6) & 0x40),
        iq3k_table + ((bq3->scales[4*ib128+2] << 6) & 0x40),
        iq3k_table + ((bq3->scales[4*ib128+3] << 6) & 0x40),
    };

    int sumi[4] = {0, 0, 0, 0};
    for (int i = 0; i < 2; ++i) {
        uint32_t vl = ql[i];
        uint32_t vh = (qh[i] << hshift) >> 2;
        // q8 walks the four consecutive q8_1 blocks of this half; stepping by
        // sizeof(block_q8_1)/4 ints assumes the blocks are laid out contiguously.
        const int * q8 = (const int *)bq8_1[4*ib128+0].qs + 2*il8;
        for (int j = 0; j < 4; ++j) {
            // Assemble per-byte 3-bit indices: 2 low bits from ql, 1 high bit from qh.
            aux32 = (vl & 0x03030303) | (vh & 0x04040404);
            const int v = int_from_table_2(aux8, values[j]);
            sumi[j] = ggml_cuda_dp4a(v, q8[i], sumi[j]);
            vl >>= 2; vh >>= 1;
            q8 += sizeof(block_q8_1)/4;
        }
    }

    // Upper 7 bits of each scale byte, biased by 127, give the signed
    // per-32-block scale; combine with the q8_1 block scales and row scale.
    aux32 = ((const uint32_t *)bq3->scales)[ib128] & 0xfefefefe;
    return d * (__low2float(bq8_1[4*ib128+0].ds) * ((int)aux8[0] - 127) * sumi[0] +
                __low2float(bq8_1[4*ib128+1].ds) * ((int)aux8[1] - 127) * sumi[1] +
                __low2float(bq8_1[4*ib128+2].ds) * ((int)aux8[2] - 127) * sumi[2] +
                __low2float(bq8_1[4*ib128+3].ds) * ((int)aux8[3] - 127) * sumi[3]);
}
729+
667730/* __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
668731 const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
669732
@@ -780,6 +843,13 @@ void mul_mat_vec_iq3_k_q8_1_cuda(
780843 iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ3_K, VDR_IQ3_K_Q8_1_MMVQ, vec_dot_iq3_k_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
781844}
782845
// Host-side launcher for the iq3_ks x q8_1 matrix-vector product on the
// given stream. NOTE(review): reuses VDR_IQ3_K_Q8_1_MMVQ from the iq3_k
// path — presumably the same quants-per-iteration applies to iq3_ks; confirm
// if a dedicated VDR constant is ever introduced.
void mul_mat_vec_iq3_ks_q8_1_cuda(
    const void * vx, const void * vy, float * dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

    iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ3_KS, VDR_IQ3_K_Q8_1_MMVQ, vec_dot_iq3_ks_q8_1>(
        vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
}
852+
783853void mul_mat_vec_iq4_k_q8_1_cuda (
784854 const void * vx, const void * vy, float * dst,
785855 const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
0 commit comments