Skip to content

Commit 3b10dff

Browse files
committed
Improved formatting of code in ggml-cpu-quants.c
1 parent e2fdc47 commit 3b10dff

File tree

1 file changed

+72
-74
lines changed

1 file changed

+72
-74
lines changed

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 72 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -5098,10 +5098,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
50985098
const svint32_t vzero_sv = svdup_n_s32(0);
50995099

51005100
const svuint8_t m0_sv = svdup_n_u8(1);
5101-
const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,1);
5102-
const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,2);
5103-
const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(),m0_sv,3);
5104-
svbool_t pred_s32 = svnot_b_z (svptrue_b32(),svptrue_pat_b32(SV_VL4));
5101+
const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1);
5102+
const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2);
5103+
const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3);
5104+
svbool_t pred_s32 = svnot_b_z (svptrue_b32(), svptrue_pat_b32(SV_VL4));
51055105

51065106
float sum = 0;
51075107

@@ -5124,147 +5124,145 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
51245124

51255125
for (int j = 0; j < 16; ++j) scale[j] -= m32;
51265126

5127-
switch(vector_length){
5127+
switch (vector_length) {
51285128
case 128:
51295129
{
5130-
svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(),qh_sv);
5131-
svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(),qh_sv+16);
5130+
svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv);
5131+
svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16);
51325132
svuint8_t q3h_sv;
51335133

51345134
svint32_t sumi1_1 = svdup_n_s32(0);
51355135
svint8_t q3bytes_sv;
51365136

51375137
for (int j = 0; j < QK_K/128; ++j) {
51385138

5139-
const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(),q3_sv); q3_sv += 16;
5140-
const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(),q3_sv); q3_sv += 16;
5141-
svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5142-
svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5139+
const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5140+
const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16;
5141+
svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5142+
svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51435143

5144-
q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m0_sv, qhbits_sv_1),2);
5145-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),q3bits_sv,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5144+
q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2);
5145+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51465146

5147-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[0]));
5147+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
51485148

5149-
q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m0_sv, qhbits_sv_2),2);
5150-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),q3bits_sv_1,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5149+
q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2);
5150+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51515151

5152-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[1]));
5152+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
51535153

5154-
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5155-
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5154+
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5155+
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51565156

5157-
q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m1_sv, qhbits_sv_1),1);
5158-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5157+
q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1);
5158+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51595159

5160-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[2]));
5160+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
51615161

5162-
q3h_sv = svlsl_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m1_sv, qhbits_sv_2),1);
5163-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5162+
q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1);
5163+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51645164

5165-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[3]));
5165+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
51665166

51675167

51685168
scale += 4;
5169-
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5170-
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5169+
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5170+
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51715171

5172-
q3h_sv = svbic_u8_x(svptrue_b8(),m2_sv, qhbits_sv_1);
5173-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5172+
q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1);
5173+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51745174

5175-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[0]));
5175+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0]));
51765176

5177-
q3h_sv = svbic_u8_x(svptrue_b8(),m2_sv, qhbits_sv_2);
5178-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5177+
q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2);
5178+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51795179

5180-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[1]));
5180+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1]));
51815181

51825182

5183-
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5184-
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(),q8_sv); q8_sv += 16;
5183+
q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
5184+
q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16;
51855185

5186-
q3h_sv = svlsr_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m3_sv, qhbits_sv_1),1);
5187-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5186+
q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1);
5187+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51885188

5189-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),svdup_n_s32((int32_t)scale[2]));
5189+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2]));
51905190

5191-
q3h_sv = svlsr_n_u8_x(svptrue_b8(),svbic_u8_x(svptrue_b8(),m3_sv, qhbits_sv_2),1);
5192-
q3bytes_sv = svsub_s8_x(svptrue_b8(),svreinterpret_s8_u8(svand_u8_m(svptrue_b8(),svlsr_n_u8_x(svptrue_b8(),q3bits_sv_1,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5191+
q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1);
5192+
q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
51935193

5194-
sumi1_1 = svmla_s32_m(svptrue_b32(),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),svdup_n_s32((int32_t)scale[3]));
5194+
sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3]));
51955195

51965196

5197-
if(j==0)
5198-
{
5199-
qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(),qhbits_sv_1,4);
5200-
qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(),qhbits_sv_2,4);
5197+
if (j==0) {
5198+
qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4);
5199+
qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4);
52015200
}
52025201

52035202
scale += 4;
52045203

52055204
}
52065205

52075206
sum += d * (svaddv_s32(svptrue_b32(), sumi1_1));
5208-
}break;
5207+
} break;
52095208
case 256:
52105209
case 512:
52115210
{
5212-
svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32),qh_sv);
5211+
svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv);
52135212
svuint8_t q3h_sv;
52145213

52155214
svint32_t sumi1_1 = svdup_n_s32(0);
52165215
svint8_t q3bytes_sv;
52175216

52185217
for (int j = 0; j < QK_K/128; ++j) {
52195218

5220-
const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32),q3_sv); q3_sv += 32;
5221-
svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5222-
svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5219+
const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32;
5220+
svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5221+
svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
52235222

5224-
q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m0_sv, qhbits_sv),2);
5225-
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),q3bits_sv,m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5223+
q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2);
5224+
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52265225

52275226

5228-
svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[0]),svdup_n_s32((int32_t)scale[1]));
5229-
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),scale_1);
5227+
svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5228+
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
52305229

5231-
q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m1_sv, qhbits_sv),1);
5232-
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,2),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5230+
q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1);
5231+
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52335232

5234-
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[2]),svdup_n_s32((int32_t)scale[3]));
5235-
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),scale_1);
5233+
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5234+
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
52365235

52375236
scale += 4;
5238-
q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5239-
q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32),q8_sv); q8_sv += 32;
5237+
q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
5238+
q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32;
52405239

5241-
q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32),m2_sv, qhbits_sv);
5242-
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,4),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5240+
q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv);
5241+
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52435242

5244-
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[0]),svdup_n_s32((int32_t)scale[1]));
5245-
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1),scale_1);
5243+
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1]));
5244+
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1);
52465245

5247-
q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),svbic_u8_x(svptrue_pat_b8(SV_VL32),m3_sv, qhbits_sv),1);
5248-
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32),svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32),svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),q3bits_sv,6),m3b_sv)), svreinterpret_s8_u8(q3h_sv));
5246+
q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1);
5247+
q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv));
52495248

5250-
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4),svdup_n_s32((int32_t)scale[2]),svdup_n_s32((int32_t)scale[3]));
5251-
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8),sumi1_1,svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2),scale_1);
5249+
scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3]));
5250+
sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1);
52525251

52535252

5254-
if(j==0)
5255-
{
5256-
qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32),qhbits_sv,4);
5253+
if (j==0) {
5254+
qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4);
52575255
}
52585256

52595257
scale += 4;
52605258

52615259
}
52625260

52635261
sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1));
5264-
}break;
5262+
} break;
52655263
default:
5266-
assert(false && "Unsupported vector length");
5267-
break;
5264+
assert(false && "Unsupported vector length");
5265+
break;
52685266
}
52695267
}
52705268
*s = sum;

0 commit comments

Comments
 (0)