Commit 0b43956

remove trailing whitespaces
1 parent de8387e commit 0b43956

File tree: 1 file changed (+76 / -76 lines)


ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 76 additions & 76 deletions
@@ -6144,70 +6144,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
    if (__riscv_vlenb() >= 32) {
        for (int i = 0; i < nb; ++i) {
-
+
            const uint8_t * GGML_RESTRICT q3 = x[i].qs;
            const uint8_t * GGML_RESTRICT qh = x[i].hmask;
            const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
+
            memcpy(aux, x[i].scales, 12);
            utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
            utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
            utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
            utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
+
            int8_t * scale = (int8_t *)utmp;
            for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-
+
+
            size_t vl = 32;
            uint8_t m = 1;
-
+
            vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
            vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
-
+
            int sum_t = 0;
-
+
            for (int j = 0; j < QK_K; j += 128) {
-
+
                vl = 32;
-
+
                // load Q3
                vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
-
+
                vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
                vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
                vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
                vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
-
+
                // compute mask for subtraction
                vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
                vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
                vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
                m <<= 1;
-
+
                vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
                vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
                vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
                m <<= 1;
-
+
                vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
                vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
                vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
                m <<= 1;
-
+
                vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
                vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
                vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
                m <<= 1;
-
+
                // load Q8 and take product with Q3
                vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
                vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
                vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
                vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
+
                vl = 16;
-
+
                // retrieve lane to multiply with scale
                vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
                vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
@@ -6217,22 +6217,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
                vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
                vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
-
+
                vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
                vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
                vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
                vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
-
+
                sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
-
+
                q3 += 32; q8 += 128; scale += 8;
-
+
            }
-
+
            const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
+
            sumf += d*sum_t;
-
+
        }
    } else if (__riscv_vlenb() == 16) {
        for (int i = 0; i < nb; ++i) {
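Context for the hunks above: the RVV loop decodes each 3-bit quant from two packed planes, the low 2 bits in q3 and a high bit in hmask, and the masked vsub applies the -4 offset whenever that hmask bit is clear. A minimal scalar sketch of the decode step, assuming the same block layout as the surrounding code (the helper name is illustrative, not part of the patch):

    #include <stdint.h>

    // Hypothetical scalar reading of the vector decode above:
    // the low 2 bits come from q3, the hmask bit decides the -4 offset.
    static inline int8_t q3k_decode_one(const uint8_t * q3, const uint8_t * hmask,
                                        int l, int shift, uint8_t m) {
        int8_t q = (q3[l] >> shift) & 3;   // 2-bit plane selected by shift (0, 2, 4, 6)
        if (!(hmask[l] & m)) q -= 4;       // clear high bit => value lies in [-4, -1]
        return q;
    }

Each decoded value is multiplied by the matching q8 byte, the 32-wide partial sums are weighted by the per-group scale (already biased by -32 during unpacking), and the block total is finally multiplied by d = GGML_FP16_TO_FP32(x[i].d) * y[i].d, as the context lines show.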
@@ -7071,64 +7071,64 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        for (int i = 0; i < nb; ++i) {

            size_t vl = 8;
-
+
            const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
            const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
+
            vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
            vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
            vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
+
            memcpy(utmp, x[i].scales, 12);
            utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
            const uint32_t uaux = utmp[1] & kmask1;
            utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
            utmp[2] = uaux;
            utmp[0] &= kmask1;
-
+
            vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
            vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
            vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
+
            vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
            sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
+
            const uint8_t * GGML_RESTRICT q4 = x[i].qs;
            const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
+
            vl = 32;
-
+
            int32_t sum_1 = 0;
            int32_t sum_2 = 0;
-
+
            vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
+
            for (int j = 0; j < QK_K/64; ++j) {
                // load Q4
                vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
+
                // load Q8 and multiply it with lower Q4 nibble
                vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
                vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
                vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
                vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
-
+
                sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
-
+
                // load Q8 and multiply it with upper Q4 nibble
                vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
                vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
                vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
                vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
-
+
                sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
-
+
                q4 += 32; q8 += 64;
-
+
            }
-
+
            sumf += d*(sum_1 + sum_2);
-
+
        }
    } else if (__riscv_vlenb() == 16) {
        for (int i = 0; i < nb; ++i) {
@@ -7180,13 +7180,13 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
            );
            sumf -= dmin * sumi;
-
+
            const uint8_t * restrict q4 = x[i].qs;
            const int8_t * restrict q8 = y[i].qs;
-
+
            sumi = 0;
            const uint8_t * scale = scales;
-
+
            for (int j = 0; j < QK_K/128; ++j) {
                int vl128 = 128, vl64 = 64, vl32 = 32;
                __asm__ __volatile__(
@@ -7230,7 +7230,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

                q4 += 64; q8 += 128; scale += 4;
            }
-
+
            sumf += d * sumi;
        }
    }
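The Q4_K hunks above pair the low nibbles of each 32-byte q4 group with q8[0..31] and the high nibbles with q8[32..63], reduce each widened product to a scalar, and only then apply the 6-bit scale. A scalar sketch of one QK_K/64 step under the same layout assumptions (the helper name is illustrative, not part of the patch):

    #include <stdint.h>

    // Hypothetical scalar equivalent of one QK_K/64 iteration above.
    static void q4k_step(const uint8_t * q4, const int8_t * q8,
                         const uint8_t * scales, int j,
                         int32_t * sum_1, int32_t * sum_2) {
        int32_t s1 = 0, s2 = 0;
        for (int l = 0; l < 32; ++l) {
            s1 += (q4[l] & 0x0F) * q8[l];        // lower nibble vs first 32 q8 values
            s2 += (q4[l] >> 4)   * q8[l + 32];   // upper nibble vs next 32 q8 values
        }
        *sum_1 += s1 * scales[2*j + 0];          // scale applied after the reduction
        *sum_2 += s2 * scales[2*j + 1];
    }

The mins path visible earlier in the hunk (q8sums, mins8, prod) subtracts dmin times the bsums-weighted mins before this loop runs, which is why the block starts with the `sumf -= dmin * ...` correction.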
@@ -8918,59 +8918,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        for (int i = 0; i < nb; ++i) {

            const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
+
            const uint8_t * GGML_RESTRICT q6 = x[i].ql;
            const uint8_t * GGML_RESTRICT qh = x[i].qh;
            const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
+
            const int8_t * GGML_RESTRICT scale = x[i].scales;
-
+
            size_t vl;
-
+
            vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
+
            int sum_t = 0;
            int is = 0;
-
+
            for (int j = 0; j < QK_K/128; ++j) {
-
+
                vl = 32;
-
+
                // load qh
                vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
-
+
                // load Q6
                vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
                vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
-
+
                vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
                vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
                vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
                vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
-
+
                vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
                vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
                vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
                vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
-
+
                vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
                vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
                vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
                vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
-
+
                vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
                vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
                vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
                vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
-
+
                // load Q8 and take product
                vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
                vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
                vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
                vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
+
                vl = 16;
-
+
                vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
                vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
                vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
@@ -8979,35 +8979,35 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
                vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
                vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
-
+
                vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
                vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
                vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
                vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
-
+
                sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
-
+
                q6 += 64; qh += 32; q8 += 128; is=8;
-
+
            }
-
+
            sumf += d * sum_t;
-
+
        }
    } else if (__riscv_vlenb() == 16) {
        for (int i = 0; i < nb; ++i) {
-
+
            const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
+
            const uint8_t * restrict q6 = x[i].ql;
            const uint8_t * restrict qh = x[i].qh;
            const int8_t * restrict q8 = y[i].qs;
-
+
            const int8_t * restrict scale = x[i].scales;
-
+
            int sum_t = 0;
            int t0;
-
+
            for (int j = 0; j < QK_K/128; ++j) {
                __asm__ __volatile__(
                    "vsetvli zero, %[vl32], e8, m2\n\t"
@@ -9063,9 +9063,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                );
                q6 += 64; qh += 32; q8 += 128; scale += 8;
            }
-
+
            sumf += d * sum_t;
-
+
        }
    }

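In the Q6_K hunks, each 6-bit value is rebuilt from a nibble of ql and a 2-bit field of qh, then recentred by subtracting 32 before the dot product with q8. A scalar sketch of that reconstruction, assuming the same packing as the context lines (the helper name is illustrative, not part of the patch):

    #include <stdint.h>

    // Hypothetical scalar reading of the qhi_* / a_* computation above:
    // value = (nibble of ql) | (2 bits of qh << 4), then centred at zero.
    static inline int8_t q6k_decode_one(const uint8_t * ql, const uint8_t * qh,
                                        int l, int nibble_shift, int qh_shift) {
        uint8_t lo = (ql[l] >> nibble_shift) & 0x0F;   // nibble_shift is 0 or 4
        uint8_t hi = (qh[l] >> qh_shift) & 0x03;       // qh_shift is 0, 2, 4 or 6
        return (int8_t)((lo | (hi << 4)) - 32);
    }

As in the other kernels, the widened products are weighted by the per-group scales (scale[is+0..7]) and the per-block result is multiplied by d before being added to sumf.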