@@ -4635,7 +4635,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
46354635
46364636 svint32_t sumi1 = svdup_n_s32(0);
46374637
4638- for (int j = 0; j < QK_K/256; ++j) {
4638+ {
46394639 const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2);
46404640 svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s));
46414641 svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
@@ -4768,8 +4768,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
47684768 acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad);
47694769
47704770 svint32_t sumi1 = svdup_n_s32(0);
4771-
4772- for (int j = 0; j < QK_K/256; ++j) {
4771+
4772+ {
47734773 const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2);
47744774 svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s));
47754775 svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
@@ -4821,8 +4821,6 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
48214821
48224822 scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7));
48234823 sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2);
4824-
4825-
48264824 }
48274825 acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad);
48284826 }
@@ -5513,6 +5511,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
55135511#if defined(__ARM_FEATURE_SVE)
55145512
55155513 uint32_t utmp[4];
5514+ //uint32_t aux[3];
55165515
55175516 const int8_t m32 = 32;
55185517 const int vector_length = svcntb()*8;
0 commit comments