@@ -5098,10 +5098,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
50985098 const svint32_t vzero_sv = svdup_n_s32(0);
50995099
51005100 const svuint8_t m0_sv = svdup_n_u8(1);
5101- const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,1);
5102- const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,2);
5103- const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,3);
5104- svbool_t pred_s32 = svnot_b_z (svptrue_b32(),svptrue_pat_b32(SV_VL4));
5101+ const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
5102+ const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
5103+ const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
5104+ svbool_t pred_s32 = svnot_b_z (svptrue_b32(), svptrue_pat_b32(SV_VL4));
51055105
51065106 float sum = 0;
51075107
@@ -5124,147 +5124,145 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
51245124
51255125 for (int j = 0; j < 16; ++j) scale[j] -= m32;
51265126
5127- switch(vector_length){
5127+ switch (vector_length) {
51285128 case 128:
51295129 {
5130- svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(),qh_sv);
5131- svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(),qh_sv+16);
5130+ svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
5131+ svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
51325132 svuint8_t q3h_sv;
51335133
51345134 svint32_t sumi1_1 = svdup_n_s32(0);
51355135 svint8_t q3bytes_sv;
51365136
51375137 for (int j = 0; j < QK_K/128; ++j) {
51385138
5139- const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(),q3_sv); q3_sv += 16;
5140- const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(),q3_sv); q3_sv += 16;
5141- svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5142- svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5139+ const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5140+ const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5141+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5142+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51435143
5144- q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m0_sv, qhbits_sv_1),2);
5145- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),q3bits_sv,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5144+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
5145+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51465146
5147- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[0]));
5147+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
51485148
5149- q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m0_sv, qhbits_sv_2),2);
5150- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),q3bits_sv_1,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5149+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
5150+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51515151
5152- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[1]));
5152+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
51535153
5154- q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5155- q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5154+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5155+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51565156
5157- q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m1_sv, qhbits_sv_1),1);
5158- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5157+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
5158+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51595159
5160- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[2]));
5160+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
51615161
5162- q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m1_sv, qhbits_sv_2),1);
5163- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5162+ q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
5163+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51645164
5165- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[3]));
5165+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
51665166
51675167
51685168 scale += 4;
5169- q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5170- q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5169+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5170+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51715171
5172- q3h_sv = svbic_u8_x(svptrue_b8(),m2_sv, qhbits_sv_1);
5173- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5172+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
5173+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51745174
5175- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[0]));
5175+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
51765176
5177- q3h_sv = svbic_u8_x(svptrue_b8(),m2_sv, qhbits_sv_2);
5178- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5177+ q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
5178+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51795179
5180- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[1]));
5180+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
51815181
51825182
5183- q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5184- q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5183+ q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5184+ q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51855185
5186- q3h_sv = svlsr_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m3_sv, qhbits_sv_1),1);
5187- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5186+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
5187+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51885188
5189- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[2]));
5189+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
51905190
5191- q3h_sv = svlsr_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m3_sv, qhbits_sv_2),1);
5192- q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5191+ q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
5192+ q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51935193
5194- sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[3]));
5194+ sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
51955195
51965196
5197- if(j==0)
5198- {
5199- qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(),qhbits_sv_1,4);
5200- qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(),qhbits_sv_2,4);
5197+ if (j==0) {
5198+ qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
5199+ qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
52015200 }
52025201
52035202 scale += 4;
52045203
52055204 }
52065205
52075206 sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
5208- }break;
5207+ } break;
52095208 case 256:
52105209 case 512:
52115210 {
5212- svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32),qh_sv);
5211+ svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
52135212 svuint8_t q3h_sv;
52145213
52155214 svint32_t sumi1_1 = svdup_n_s32(0);
52165215 svint8_t q3bytes_sv;
52175216
52185217 for (int j = 0; j < QK_K/128; ++j) {
52195218
5220- const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32),q3_sv); q3_sv += 32;
5221- svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5222- svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5219+ const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
5220+ svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5221+ svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
52235222
5224- q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m0_sv, qhbits_sv),2);
5225- q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),q3bits_sv,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5223+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
5224+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52265225
52275226
5228- svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[0]),svdup_n_s32((int32_t)scale[1]));
5229- sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),scale_1);
5227+ svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5228+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
52305229
5231- q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m1_sv, qhbits_sv),1);
5232- q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5230+ q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
5231+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52335232
5234- scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[2]),svdup_n_s32((int32_t)scale[3]));
5235- sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),scale_1);
5233+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5234+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
52365235
52375236 scale += 4;
5238- q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5239- q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5237+ q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5238+ q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
52405239
5241- q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32),m2_sv, qhbits_sv);
5242- q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5240+ q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
5241+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52435242
5244- scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[0]),svdup_n_s32((int32_t)scale[1]));
5245- sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),scale_1);
5243+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5244+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
52465245
5247- q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m3_sv, qhbits_sv),1);
5248- q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5246+ q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
5247+ q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52495248
5250- scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[2]),svdup_n_s32((int32_t)scale[3]));
5251- sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),scale_1);
5249+ scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5250+ sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
52525251
52535252
5254- if(j==0)
5255- {
5256- qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),qhbits_sv,4);
5253+ if (j==0) {
5254+ qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
52575255 }
52585256
52595257 scale += 4;
52605258
52615259 }
52625260
52635261 sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
5264- }break;
5262+ } break;
52655263 default:
5266- assert(false && "Unsupported vector length");
5267- break;
5264+ assert(false && "Unsupported vector length");
5265+ break;
52685266 }
52695267 }
52705268 *s = sum;
0 commit comments