@@ -6144,70 +6144,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
61446144 float sumf = 0;
61456145 if (__riscv_vlenb() >= 32) {
61466146 for (int i = 0; i < nb; ++i) {
6147-
6147+
61486148 const uint8_t * GGML_RESTRICT q3 = x[i].qs;
61496149 const uint8_t * GGML_RESTRICT qh = x[i].hmask;
61506150 const int8_t * GGML_RESTRICT q8 = y[i].qs;
6151-
6151+
61526152 memcpy(aux, x[i].scales, 12);
61536153 utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
61546154 utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
61556155 utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
61566156 utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
6157-
6157+
61586158 int8_t * scale = (int8_t *)utmp;
61596159 for (int j = 0; j < 16; ++j) scale[j] -= 32;
6160-
6161-
6160+
6161+
61626162 size_t vl = 32;
61636163 uint8_t m = 1;
6164-
6164+
61656165 vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
61666166 vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
6167-
6167+
61686168 int sum_t = 0;
6169-
6169+
61706170 for (int j = 0; j < QK_K; j += 128) {
6171-
6171+
61726172 vl = 32;
6173-
6173+
61746174 // load Q3
61756175 vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
6176-
6176+
61776177 vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
61786178 vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
61796179 vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
61806180 vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
6181-
6181+
61826182 // compute mask for subtraction
61836183 vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
61846184 vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
61856185 vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
61866186 m <<= 1;
6187-
6187+
61886188 vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
61896189 vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
61906190 vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
61916191 m <<= 1;
6192-
6192+
61936193 vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
61946194 vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
61956195 vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
61966196 m <<= 1;
6197-
6197+
61986198 vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
61996199 vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
62006200 vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
62016201 m <<= 1;
6202-
6202+
62036203 // load Q8 and take product with Q3
62046204 vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
62056205 vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
62066206 vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
62076207 vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
6208-
6208+
62096209 vl = 16;
6210-
6210+
62116211 // retrieve lane to multiply with scale
62126212 vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
62136213 vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
@@ -6217,22 +6217,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
62176217 vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
62186218 vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
62196219 vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
6220-
6220+
62216221 vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
62226222 vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
62236223 vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
62246224 vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
6225-
6225+
62266226 sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
6227-
6227+
62286228 q3 += 32; q8 += 128; scale += 8;
6229-
6229+
62306230 }
6231-
6231+
62326232 const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
6233-
6233+
62346234 sumf += d*sum_t;
6235-
6235+
62366236 }
62376237 } else if (__riscv_vlenb() == 16) {
62386238 for (int i = 0; i < nb; ++i) {
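For reference, the masked subtraction in the Q3_K kernel above mirrors the scalar decode: each weight is a 2-bit field taken from qs, and 4 is subtracted wherever the corresponding hmask bit is clear. A minimal scalar sketch of that per-chunk dot product follows, with hypothetical names; it is an illustration of the idea, not code from this diff.

#include <stdint.h>

// Scalar sketch (hypothetical): dot one 32-wide Q3_K chunk against q8.
// `shift` selects the 2-bit field of q3[l], `m` is the current hmask bit.
// The caller multiplies the result by the sub-block scale, as the RVV loop does.
static int32_t q3k_chunk_dot_sketch(const uint8_t * q3, const uint8_t * qh,
                                    const int8_t * q8, int shift, uint8_t m) {
    int32_t sum = 0;
    for (int l = 0; l < 32; ++l) {
        int8_t v = (int8_t)((q3[l] >> shift) & 3);
        if (!(qh[l] & m)) v -= 4;        // subtract 4 where the high bit is clear
        sum += (int32_t)v * q8[l];
    }
    return sum;
}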
@@ -7071,64 +7071,64 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
70717071 for (int i = 0; i < nb; ++i) {
70727072
70737073 size_t vl = 8;
7074-
7074+
70757075 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
70767076 const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
7077-
7077+
70787078 vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
70797079 vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
70807080 vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
7081-
7081+
70827082 memcpy(utmp, x[i].scales, 12);
70837083 utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
70847084 const uint32_t uaux = utmp[1] & kmask1;
70857085 utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
70867086 utmp[2] = uaux;
70877087 utmp[0] &= kmask1;
7088-
7088+
70897089 vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
70907090 vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
70917091 vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
7092-
7092+
70937093 vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
70947094 sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
7095-
7095+
70967096 const uint8_t * GGML_RESTRICT q4 = x[i].qs;
70977097 const int8_t * GGML_RESTRICT q8 = y[i].qs;
7098-
7098+
70997099 vl = 32;
7100-
7100+
71017101 int32_t sum_1 = 0;
71027102 int32_t sum_2 = 0;
7103-
7103+
71047104 vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
7105-
7105+
71067106 for (int j = 0; j < QK_K/64; ++j) {
71077107 // load Q4
71087108 vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
7109-
7109+
71107110 // load Q8 and multiply it with lower Q4 nibble
71117111 vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
71127112 vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
71137113 vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
71147114 vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
7115-
7115+
71167116 sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
7117-
7117+
71187118 // load Q8 and multiply it with upper Q4 nibble
71197119 vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
71207120 vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
71217121 vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
71227122 vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
7123-
7123+
71247124 sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
7125-
7125+
71267126 q4 += 32; q8 += 64;
7127-
7127+
71287128 }
7129-
7129+
71307130 sumf += d*(sum_1 + sum_2);
7131-
7131+
71327132 }
71337133 } else if (__riscv_vlenb() == 16) {
71347134 for (int i = 0; i < nb; ++i) {
@@ -7180,13 +7180,13 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
71807180 , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
71817181 );
71827182 sumf -= dmin * sumi;
7183-
7183+
71847184 const uint8_t * restrict q4 = x[i].qs;
71857185 const int8_t * restrict q8 = y[i].qs;
7186-
7186+
71877187 sumi = 0;
71887188 const uint8_t * scale = scales;
7189-
7189+
71907190 for (int j = 0; j < QK_K/128; ++j) {
71917191 int vl128 = 128, vl64 = 64, vl32 = 32;
71927192 __asm__ __volatile__(
@@ -7230,7 +7230,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
72307230
72317231 q4 += 64; q8 += 128; scale += 4;
72327232 }
7233-
7233+
72347234 sumf += d * sumi;
72357235 }
72367236 }
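The Q4_K path above follows the same K-quant pattern: the low and high nibbles of each byte are dotted with two consecutive 32-element slices of q8 and weighted by per-sub-block scales, while the mins are folded in separately through y[i].bsums. Below is a scalar sketch of the nibble part, assuming the 6-bit scales have already been unpacked; names are hypothetical and this is not code from the diff.

#include <stdint.h>

// Scalar sketch (hypothetical): one iteration of the j-loop above. Low nibbles
// pair with q8[0..31], high nibbles with q8[32..63]; each half has its own scale.
static int32_t q4k_subblock_dot_sketch(const uint8_t * q4, const int8_t * q8,
                                       const uint8_t * scales, int j) {
    int32_t lo = 0, hi = 0;
    for (int l = 0; l < 32; ++l) {
        lo += (int32_t)(q4[l] & 0x0F) * q8[l];
        hi += (int32_t)(q4[l] >> 4)   * q8[l + 32];
    }
    return lo * scales[2*j + 0] + hi * scales[2*j + 1];
}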
@@ -8918,59 +8918,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
89188918 for (int i = 0; i < nb; ++i) {
89198919
89208920 const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
8921-
8921+
89228922 const uint8_t * GGML_RESTRICT q6 = x[i].ql;
89238923 const uint8_t * GGML_RESTRICT qh = x[i].qh;
89248924 const int8_t * GGML_RESTRICT q8 = y[i].qs;
8925-
8925+
89268926 const int8_t * GGML_RESTRICT scale = x[i].scales;
8927-
8927+
89288928 size_t vl;
8929-
8929+
89308930 vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
8931-
8931+
89328932 int sum_t = 0;
89338933 int is = 0;
8934-
8934+
89358935 for (int j = 0; j < QK_K/128; ++j) {
8936-
8936+
89378937 vl = 32;
8938-
8938+
89398939 // load qh
89408940 vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
8941-
8941+
89428942 // load Q6
89438943 vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
89448944 vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
8945-
8945+
89468946 vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
89478947 vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
89488948 vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
89498949 vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
8950-
8950+
89518951 vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
89528952 vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
89538953 vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
89548954 vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
8955-
8955+
89568956 vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
89578957 vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
89588958 vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
89598959 vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
8960-
8960+
89618961 vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
89628962 vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
89638963 vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
89648964 vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
8965-
8965+
89668966 // load Q8 and take product
89678967 vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
89688968 vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
89698969 vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
89708970 vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
8971-
8971+
89728972 vl = 16;
8973-
8973+
89748974 vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
89758975 vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
89768976 vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
@@ -8979,35 +8979,35 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
89798979 vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
89808980 vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
89818981 vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
8982-
8982+
89838983 vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
89848984 vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
89858985 vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
89868986 vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
8987-
8987+
89888988 sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
8989-
8989+
89908990 q6 += 64; qh += 32; q8 += 128; is=8;
8991-
8991+
89928992 }
8993-
8993+
89948994 sumf += d * sum_t;
8995-
8995+
89968996 }
89978997 } else if (__riscv_vlenb() == 16) {
89988998 for (int i = 0; i < nb; ++i) {
8999-
8999+
90009000 const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
9001-
9001+
90029002 const uint8_t * restrict q6 = x[i].ql;
90039003 const uint8_t * restrict qh = x[i].qh;
90049004 const int8_t * restrict q8 = y[i].qs;
9005-
9005+
90069006 const int8_t * restrict scale = x[i].scales;
9007-
9007+
90089008 int sum_t = 0;
90099009 int t0;
9010-
9010+
90119011 for (int j = 0; j < QK_K/128; ++j) {
90129012 __asm__ __volatile__(
90139013 "vsetvli zero, %[vl32], e8, m2\n\t"
@@ -9063,9 +9063,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
90639063 );
90649064 q6 += 64; qh += 32; q8 += 128; scale += 8;
90659065 }
9066-
9066+
90679067 sumf += d * sum_t;
9068-
9068+
90699069 }
90709070 }
90719071
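For Q6_K, the kernel above rebuilds each 6-bit weight from a low nibble in ql and a 2-bit field in qh, recenters it by 32, and applies one scale per 16 values. A scalar sketch of the reconstruction step is shown below with a hypothetical name; it illustrates the bit layout rather than reproducing code from this diff.

#include <stdint.h>

// Scalar sketch (hypothetical): rebuild one Q6_K weight. `lshift` is 0 or 4
// (which nibble of ql), `hshift` is 0, 2, 4 or 6 (which 2-bit field of qh);
// the result lies in [-32, 31].
static inline int8_t q6k_weight_sketch(uint8_t ql, uint8_t qh, int lshift, int hshift) {
    uint8_t lo = (ql >> lshift) & 0x0F;
    uint8_t hi = (qh >> hshift) & 0x03;
    return (int8_t)(lo | (hi << 4)) - 32;
}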