Skip to content

Commit d36e61c

Browse files
authored
ggml-cpu: clean up s390x SIMD (ggml-org#15855)
* ggml-cpu: clean up s390x simd

  Signed-off-by: Aaron Teo <[email protected]>
  (cherry picked from commit 0da4b6a)
  Signed-off-by: Aaron Teo <[email protected]>

* ggml-cpu: fix hsum data types

  Signed-off-by: Aaron Teo <[email protected]>

---------

Signed-off-by: Aaron Teo <[email protected]>
1 parent c97b5e5 commit d36e61c

File tree

2 files changed: 63 additions and 60 deletions.

ggml/src/ggml-cpu/arch/s390/quants.c

Lines changed: 57 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
5353

5454
#if defined(__VXE__) || defined(__VXE2__)
5555
for (int i = 0; i < nb; i++) {
56-
__vector float srcv [8];
57-
__vector float asrcv[8];
58-
__vector float amaxv[8];
56+
float32x4_t srcv [8];
57+
float32x4_t asrcv[8];
58+
float32x4_t amaxv[8];
5959

6060
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
6161
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
7474
y[i].d = GGML_CPU_FP32_TO_FP16(d);
7575

7676
for (int j = 0; j < 8; j++) {
77-
const __vector float v = vec_mul(srcv[j], vec_splats(id));
78-
const __vector int32_t vi = vec_signed(v);
77+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
78+
const int32x4_t vi = vec_signed(v);
7979

8080
y[i].qs[4*j + 0] = vec_extract(vi, 0);
8181
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
9898

9999
#if defined(__VXE__) || defined(__VXE2__)
100100
for (int i = 0; i < nb; i++) {
101-
__vector float srcv [8];
102-
__vector float asrcv[8];
103-
__vector float amaxv[8];
101+
float32x4_t srcv [8];
102+
float32x4_t asrcv[8];
103+
float32x4_t amaxv[8];
104104

105105
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
106106
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
118118

119119
y[i].d = GGML_CPU_FP32_TO_FP16(d);
120120

121-
__vector int32_t acc = vec_splats(0);
121+
int32x4_t acc = vec_splats(0);
122122

123123
for (int j = 0; j < 8; j++) {
124-
const __vector float v = vec_mul(srcv[j], vec_splats(id));
125-
const __vector int32_t vi = vec_signed(v);
124+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
125+
const int32x4_t vi = vec_signed(v);
126126

127127
y[i].qs[4*j + 0] = vec_extract(vi, 0);
128128
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
162162
float sumf = 0;
163163

164164
#if defined(__VXE__) || defined(__VXE2__)
165-
__vector float acc = vec_splats(0.0f);
165+
float32x4_t acc = vec_splats(0.0f);
166166

167-
const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
168-
const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
167+
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
168+
const int8x16_t v_s = vec_splats( (const int8_t)0x08);
169169

170170
for (; ib < nb; ++ib) {
171-
const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
172-
const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
173-
const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
171+
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
172+
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
173+
const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
174174

175-
const __vector int8_t v_xls = vec_sub(v_xl, v_s);
176-
const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
175+
const int8x16_t v_xls = vec_sub(v_xl, v_s);
176+
const int8x16_t v_xhs = vec_sub(v_xh, v_s);
177177

178-
const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
179-
const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
178+
const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
179+
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
180180

181-
const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
182-
const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
183-
const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
184-
const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
181+
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
182+
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
183+
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
184+
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
185185

186-
__vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
186+
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
187187

188-
const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
189-
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
188+
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
189+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
190190

191191
acc = vec_madd(v_xy, v_d, acc);
192192
}
193193

194-
sumf = acc[0] + acc[1] + acc[2] + acc[3];
195-
194+
sumf = vec_hsum_f32x4(acc);
196195
*s = sumf;
197196
#else
198197
UNUSED(nb);
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
249248
acc = vec_madd(v_xy, v_d, acc);
250249
}
251250

252-
sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
253-
251+
sumf = vec_hsum_f32x4(acc) + summs;
254252
*s = sumf;
255253
#else
256254
UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
351349
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
352350
}
353351

354-
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
352+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
355353

356354
#pragma GCC unroll 4
357355
for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
390388
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
391389
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
392390

393-
sumf += vec_hsum(v_acc);
391+
sumf += vec_hsum_f32x4(v_acc);
394392
}
395393

396394
*s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
502500
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
503501
}
504502

505-
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
503+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
506504

507505
#pragma GCC unroll 4
508506
for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
543541
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
544542
const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
545543

546-
sumf += vec_hsum(v_acc) + summs;
544+
sumf += vec_hsum_f32x4(v_acc) + summs;
547545
}
548546

549547
*s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
575573
float sumf = 0;
576574

577575
#if defined(__VXE__) || defined(__VXE2__)
578-
__vector float acc = vec_splats(0.0f);
576+
float32x4_t acc = vec_splats(0.0f);
579577

580578
#pragma GCC unroll 8
581579
for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
594592
acc = vec_madd(v_xy, v_d, acc);
595593
}
596594

597-
sumf = acc[0] + acc[1] + acc[2] + acc[3];
595+
sumf = vec_hsum_f32x4(acc);
598596

599597
*s = sumf;
600598
#else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
718716
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
719717
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
720718

721-
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
722-
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
723-
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
724-
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
719+
isum += vec_hsum_i32x4(isum0) * scale[0];
720+
isum += vec_hsum_i32x4(isum1) * scale[1];
721+
isum += vec_hsum_i32x4(isum2) * scale[2];
722+
isum += vec_hsum_i32x4(isum3) * scale[3];
725723

726724
scale += 4;
727725

@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
819817
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
820818

821819
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
822-
sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
820+
sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
823821

824822
v_y[0] = vec_xl(0 , y0);
825823
v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
829827
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
830828

831829
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
832-
sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
830+
sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
833831
}
834832

835833
sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
911909
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
912910
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
913911
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
914-
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
912+
const int32_t mins = vec_hsum_i32x4(v_mins);
915913

916914
const uint8_t * scales = (const uint8_t *)utmp;
917915
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
948946
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
949947
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
950948

951-
sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
952-
sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
949+
sumi += vec_hsum_i32x4(sumi0) * *scales++;
950+
sumi += vec_hsum_i32x4(sumi1) * *scales++;
953951
}
954952

955953
sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10201018
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
10211019
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
10221020

1023-
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
1021+
const int32_t mins = vec_hsum_i32x4(v_mins);
10241022

10251023
int32_t isum = 0;
10261024
for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10601058
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
10611059
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
10621060

1063-
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
1064-
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
1065-
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
1066-
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
1061+
isum += vec_hsum_i32x4(summs0) * scale[0] +
1062+
vec_hsum_i32x4(summs1) * scale[1] +
1063+
vec_hsum_i32x4(summs2) * scale[2] +
1064+
vec_hsum_i32x4(summs3) * scale[3];
10671065

10681066
scale += 4;
10691067

@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10941092
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
10951093
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
10961094

1097-
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
1098-
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
1099-
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
1100-
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
1095+
isum += vec_hsum_i32x4(summs0) * scale[0] +
1096+
vec_hsum_i32x4(summs1) * scale[1] +
1097+
vec_hsum_i32x4(summs2) * scale[2] +
1098+
vec_hsum_i32x4(summs3) * scale[3];
11011099

11021100
scale += 4;
11031101
}
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
12851283
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
12861284
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
12871285

1288-
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
1286+
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
12891287
}
12901288

12911289
*s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
13541352

13551353
h >>= 4;
13561354

1357-
sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
1358-
sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
1355+
sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
1356+
sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
13591357
}
13601358

13611359
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
483483
/**
484484
* @see https://github.com/ggml-org/llama.cpp/pull/14037
485485
*/
486-
inline static float vec_hsum(float32x4_t v) {
486+
inline static float vec_hsum_f32x4(float32x4_t v) {
487487
float32x4_t v_temp = v + vec_reve(v);
488488
return v_temp[0] + v_temp[1];
489489
}
490490

491+
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
492+
int32x4_t v_temp = v + vec_reve(v);
493+
return v_temp[0] + v_temp[1];
494+
}
495+
491496
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
492497
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
493498
return acc + (vec_unpackh(p) + vec_unpackl(p));

0 commit comments

Comments
 (0)