From e5a946943b5999796fe0574489e2549560ca7448 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 01:08:53 +0800 Subject: [PATCH 01/12] ggml-cpu: initial q5_0 impl for s390x Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch-fallback.h | 1 - ggml/src/ggml-cpu/arch/s390/quants.c | 96 ++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 0bfb92df17909..1e15249a294ce 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -150,7 +150,6 @@ #elif defined(__s390x__) // quants.c #define quantize_row_q8_K_generic quantize_row_q8_K -#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0 #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 7e4229d0e46a9..9df491e17b139 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -241,6 +241,102 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t acc = vec_splats(0.0f); + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + const uint16x8_t v_ml = { 1, 2, 4, 8, 16, 32, 64, 128 }; + const uint16x8_t v_mh = { 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 }; + const uint16x8_t v_z = vec_splats((uint16_t)0); + + #pragma GCC unroll 8 + for (; ib < nb; ++ib) { + // Load 32-bit high flags (5th bit) into a 32-bit integer + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const uint16_t qh_e = qh & 0xFFFF; + const uint16_t qh_o = qh >> 16; + + const uint16x8_t v_qhe = vec_splats(qh_e); + const uint16x8_t v_qhel = vec_and(v_qhe, v_ml); + const uint16x8_t v_qheh = vec_and(v_qhe, v_mh); + const uint16x8_t v_mel = vec_cmpeq(v_qhel, v_z); + const uint16x8_t v_meh = vec_cmpeq(v_qheh, v_z); + + const uint16x8_t v_cel = vec_sr(v_mel, vec_splats((uint16_t)15)); + const uint16x8_t v_ceh = vec_sr(v_meh, vec_splats((uint16_t)15)); + + const uint16x8_t v_subel = vec_mul(v_cel, vec_splats((uint16_t)0x10)); + const uint16x8_t v_subeh = vec_mul(v_ceh, vec_splats((uint16_t)0x10)); + + const uint8x16_t v_qhep = vec_pack(v_subel, v_subeh); + + const uint16x8_t v_qho = vec_splats(qh_o); + const uint16x8_t v_qhol = vec_and(v_qho, v_ml); + const uint16x8_t v_qhoh = vec_and(v_qho, v_mh); + const uint16x8_t v_mol = vec_cmpeq(v_qhol, v_z); + const uint16x8_t v_moh = vec_cmpeq(v_qhoh, v_z); + + const uint16x8_t v_col = vec_sr(v_mol, vec_splats((uint16_t)15)); + const uint16x8_t v_coh = vec_sr(v_moh, vec_splats((uint16_t)15)); + + const uint16x8_t v_subol = vec_mul(v_col, vec_splats((uint16_t)0x10)); + const uint16x8_t v_suboh = vec_mul(v_coh, vec_splats((uint16_t)0x10)); + + const uint8x16_t v_qhop = vec_pack(v_subol, v_suboh); + + + const uint8x16_t v_x = vec_xl(0, x[ib].qs); + const int8x16_t 
v_xl = (int8x16_t)vec_and(v_x, v_m); + const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, vec_splats((uint8_t)4)); + + const int8x16_t v_xlf = vec_sub(v_xl, (int8x16_t)v_qhep); + const int8x16_t v_xhf = vec_sub(v_xh, (int8x16_t)v_qhop); + + const int8x16_t v_yl = vec_xl(0, y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot( + ggml_vec_dot(vec_splats(0), v_xlf, v_yl), + v_xhf, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * + GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3]; + *s = sumf; +#else + UNUSED(ib); + UNUSED(x); + UNUSED(y); + UNUSED(sumf); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; From 15067372e9a99e173588d199e9690ad2d251cfb0 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 14:46:44 +0800 Subject: [PATCH 02/12] ggml-cpu: updated q5_0 code for better performance Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 157 ++++++++++++++++++--------- 1 file changed, 106 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 9df491e17b139..180367cc0b95c 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -23,6 +23,21 @@ #define UNUSED GGML_UNUSED +#if defined(__VXE__) || defined(__VXE2__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +// static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 +static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); @@ -260,73 +275,113 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) - float32x4_t acc = vec_splats(0.0f); - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - const uint16x8_t v_ml = { 1, 2, 4, 8, 16, 32, 64, 128 }; - const uint16x8_t v_mh = { 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 }; - const uint16x8_t v_z = vec_splats((uint16_t)0); + const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 }; + + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + #pragma GCC unroll 8 + for (; ib + 1 < nb; ib += 2) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + uint32_t qh0, qh1; + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + uint64_t tmp0[4], tmp1[4]; + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = 
table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); + uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); + + uint8x16_t v_x0l = vec_and(v_x0, v_m); + uint8x16_t v_x0h = vec_sr(v_x0, vec_splats((uint8_t)0x04)); + uint8x16_t v_x1l = vec_and(v_x1, v_m); + uint8x16_t v_x1h = vec_sr(v_x1, vec_splats((uint8_t)0x04)); + + int8x16_t v_x0lf = vec_sub((int8x16_t)v_x0l, v_qh0l); + int8x16_t v_x0hf = vec_sub((int8x16_t)v_x0h, v_qh0h); + int8x16_t v_x1lf = vec_sub((int8x16_t)v_x1l, v_qh1l); + int8x16_t v_x1hf = vec_sub((int8x16_t)v_x1h, v_qh1h); + + int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); + int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); + int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); + + int32x4_t v_sums0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_sums1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + + float32x4_t v_sums0f = vec_float(v_sums0); + float32x4_t v_sums1f = vec_float(v_sums1); + + const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_sum0 = vec_madd(v_sums0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); + } + + float32x4_t v_sumv = vec_add(v_sum0, v_sum1); + sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3]; #pragma GCC unroll 8 for (; ib < nb; ++ib) { - // Load 32-bit high flags (5th bit) into a 32-bit integer uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - const uint16_t qh_e = qh & 0xFFFF; - const uint16_t qh_o = qh >> 16; - - const uint16x8_t v_qhe = vec_splats(qh_e); - const uint16x8_t v_qhel = vec_and(v_qhe, v_ml); - const uint16x8_t v_qheh = vec_and(v_qhe, v_mh); - const uint16x8_t v_mel = vec_cmpeq(v_qhel, v_z); - const uint16x8_t v_meh = vec_cmpeq(v_qheh, v_z); - - const uint16x8_t v_cel = vec_sr(v_mel, vec_splats((uint16_t)15)); - const uint16x8_t v_ceh = vec_sr(v_meh, vec_splats((uint16_t)15)); + uint64_t tmp[4]; + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; - const uint16x8_t v_subel = vec_mul(v_cel, vec_splats((uint16_t)0x10)); - const uint16x8_t v_subeh = vec_mul(v_ceh, vec_splats((uint16_t)0x10)); + int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); + int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); + v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); - const uint8x16_t v_qhep = vec_pack(v_subel, v_subeh); + uint8x16_t v_x = vec_xl(0, (const uint8_t *)x[ib].qs); + uint8x16_t v_xl = vec_and(v_x, v_m); + uint8x16_t v_xh = vec_sr(v_x, vec_splats((uint8_t)0x04)); - const 
uint16x8_t v_qho = vec_splats(qh_o); - const uint16x8_t v_qhol = vec_and(v_qho, v_ml); - const uint16x8_t v_qhoh = vec_and(v_qho, v_mh); - const uint16x8_t v_mol = vec_cmpeq(v_qhol, v_z); - const uint16x8_t v_moh = vec_cmpeq(v_qhoh, v_z); + int8x16_t v_xlf = vec_sub((int8x16_t)v_xl, v_qhl); + int8x16_t v_xhf = vec_sub((int8x16_t)v_xh, v_qhh); - const uint16x8_t v_col = vec_sr(v_mol, vec_splats((uint16_t)15)); - const uint16x8_t v_coh = vec_sr(v_moh, vec_splats((uint16_t)15)); + int8x16_t v_yl = vec_xl(0, (const int8_t *)y[ib].qs); + int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y[ib].qs); - const uint16x8_t v_subol = vec_mul(v_col, vec_splats((uint16_t)0x10)); - const uint16x8_t v_suboh = vec_mul(v_coh, vec_splats((uint16_t)0x10)); + int32x4_t v_sums = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + float32x4_t v_sumsf = vec_float(v_sums); - const uint8x16_t v_qhop = vec_pack(v_subol, v_suboh); - - - const uint8x16_t v_x = vec_xl(0, x[ib].qs); - const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); - const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, vec_splats((uint8_t)4)); - - const int8x16_t v_xlf = vec_sub(v_xl, (int8x16_t)v_qhep); - const int8x16_t v_xhf = vec_sub(v_xh, (int8x16_t)v_qhop); - - const int8x16_t v_yl = vec_xl(0, y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot( - ggml_vec_dot(vec_splats(0), v_xlf, v_yl), - v_xhf, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - - const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * - GGML_CPU_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); - acc = vec_madd(v_xy, v_d, acc); + sumf += acc[0] + acc[1] + acc[2] + acc[3]; } - sumf = acc[0] + acc[1] + acc[2] + acc[3]; *s = sumf; #else UNUSED(ib); From d02fbd8edee4af56330bbc88d3b5763db81657ed Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 15:01:25 +0800 Subject: [PATCH 03/12] ggml-cpu: use optimised hsum for better performance Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 180367cc0b95c..d01e54fd2efac 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -345,7 +345,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } float32x4_t v_sumv = vec_add(v_sum0, v_sum1); - sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3]; + float32x4_t v_temp = v_sumv + vec_reve(v_sumv); // Optimised hsum + sumf += v_temp[0] + v_temp[1]; #pragma GCC unroll 8 for (; ib < nb; ++ib) { @@ -379,7 +380,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); - sumf += acc[0] + acc[1] + acc[2] + acc[3]; + float32x4_t v_temp = acc + vec_reve(acc); // Optimised hsum + sumf += v_temp[0] + v_temp[1]; } *s = sumf; From dd6deeffa0fed9435f8713e11280d9a216718cdf Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:31:28 +0800 Subject: [PATCH 04/12] ggml-cpu: introduce q5_1 simd + refactor q5_0 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch-fallback.h | 1 - ggml/src/ggml-cpu/arch/s390/quants.c | 238 ++++++++++++++++++++++----- 
ggml/src/ggml-cpu/ggml-cpu-impl.h | 8 + 3 files changed, 206 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 1e15249a294ce..373408a9c0955 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -150,7 +150,6 @@ #elif defined(__s390x__) // quants.c #define quantize_row_q8_K_generic quantize_row_q8_K -#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index d01e54fd2efac..60484857ec344 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -275,13 +275,16 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + uint32_t qh0, qh1; + uint64_t tmp0[4], tmp1[4]; + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; - float32x4_t v_sum0 = vec_splats(0.0f); - float32x4_t v_sum1 = vec_splats(0.0f); - #pragma GCC unroll 8 for (; ib + 1 < nb; ib += 2) { const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; @@ -289,15 +292,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - uint32_t qh0, qh1; memcpy(&qh0, x0->qh, sizeof(qh0)); memcpy(&qh1, x1->qh, sizeof(qh1)); - uint64_t tmp0[4], tmp1[4]; tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; @@ -308,34 +310,35 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + // required for fixing the byteorder v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); - uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); - uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); + const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); + const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); - uint8x16_t v_x0l = vec_and(v_x0, v_m); - uint8x16_t v_x0h = vec_sr(v_x0, vec_splats((uint8_t)0x04)); - uint8x16_t v_x1l = vec_and(v_x1, v_m); - uint8x16_t v_x1h = vec_sr(v_x1, vec_splats((uint8_t)0x04)); + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); - int8x16_t v_x0lf = vec_sub((int8x16_t)v_x0l, v_qh0l); - int8x16_t v_x0hf = vec_sub((int8x16_t)v_x0h, v_qh0h); - int8x16_t v_x1lf = vec_sub((int8x16_t)v_x1l, v_qh1l); - int8x16_t v_x1hf = vec_sub((int8x16_t)v_x1h, v_qh1h); + const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l); + const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h); + const int8x16_t 
v_x1lf = vec_sub(v_x1l, v_qh1l); + const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h); - int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); - int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); - int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); - int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); + const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); - int32x4_t v_sums0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_sums1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); - float32x4_t v_sums0f = vec_float(v_sums0); - float32x4_t v_sums1f = vec_float(v_sums1); + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); @@ -344,14 +347,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); } - float32x4_t v_sumv = vec_add(v_sum0, v_sum1); - float32x4_t v_temp = v_sumv + vec_reve(v_sumv); // Optimised hsum - sumf += v_temp[0] + v_temp[1]; + sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); #pragma GCC unroll 8 for (; ib < nb; ++ib) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); + memcpy(&qh, x0->qh, sizeof(qh)); uint64_t tmp[4]; tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; @@ -361,27 +365,28 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + + // required for fixing the byteorder v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); - uint8x16_t v_x = vec_xl(0, (const uint8_t *)x[ib].qs); - uint8x16_t v_xl = vec_and(v_x, v_m); - uint8x16_t v_xh = vec_sr(v_x, vec_splats((uint8_t)0x04)); + const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); - int8x16_t v_xlf = vec_sub((int8x16_t)v_xl, v_qhl); - int8x16_t v_xhf = vec_sub((int8x16_t)v_xh, v_qhh); + const int8x16_t v_xlf = vec_sub(v_xl, v_qhl); + const int8x16_t v_xhf = vec_sub(v_xh, v_qhh); - int8x16_t v_yl = vec_xl(0, (const int8_t *)y[ib].qs); - int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y[ib].qs); + const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs); - int32x4_t v_sums = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); - float32x4_t v_sumsf = vec_float(v_sums); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + const float32x4_t v_xyf = vec_float(v_xy); - const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); - float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); + const float32x4_t v_d = 
vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); - float32x4_t v_temp = acc + vec_reve(acc); // Optimised hsum - sumf += v_temp[0] + v_temp[1]; + sumf += vec_hsum(v_acc); } *s = sumf; @@ -394,6 +399,159 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sums0 = vec_splats(0.0f); + float32x4_t v_sums1 = vec_splats(0.0f); + + float summs0 = 0.0f, summs1 = 0.0f; + + uint32_t qh0, qh1; + uint64_t tmp0[4], tmp1[4]; + + uint8x16_t v_kperm = (const uint8x16_t){ + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 + }; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + const uint8x16_t v_x0 = vec_xl(0, x0->qs); + const uint8x16_t v_x1 = vec_xl(0, x1->qs); + + const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l); + const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h); + const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l); + const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h); + + const int8x16_t v_y0l = vec_xl(0, y0->qs); + const int8x16_t v_y0h = vec_xl(16, y0->qs); + const int8x16_t v_y1l = vec_xl(0, y1->qs); + const int8x16_t v_y1h = vec_xl(16, y1->qs); + + int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + + float32x4_t v_xy0f = vec_float(v_xy0); + float32x4_t v_xy1f = vec_float(v_xy1); + + const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); 
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_sums0 = vec_madd(v_xy0f, v_d0, v_sums0); + v_sums1 = vec_madd(v_xy1f, v_d1, v_sums1); + } + + float32x4_t v_sumv = vec_add(v_sums0, v_sums1); + sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3] + summs0 + summs1; + + for (; ib < nb; ++ib) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + uint64_t tmp[4]; + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); + int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + + v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); + v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); + + const uint8x16_t v_x = vec_xl(0, x0->qs); + const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); + + const int8x16_t v_xlf = vec_or(v_xl, v_qhl); + const int8x16_t v_xhf = vec_or(v_xh, v_qhh); + + const int8x16_t v_yl = vec_xl(0, y0->qs); + const int8x16_t v_yh = vec_xl(16, y0->qs); + + int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + float32x4_t v_xyf = vec_float(v_xy); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + + float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); + sumf += v_acc[0] + v_acc[1] + v_acc[2] + v_acc[3] + summs; + } + + *s = sumf; +#else + UNUSED(nb); + UNUSED(x); + UNUSED(y); + UNUSED(ib); + UNUSED(sumf); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index d839cf5c55e81..2c05985068a0a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) { return v_abo + v_abe; } +/** + * @see https://github.com/ggml-org/llama.cpp/pull/14037 + */ +inline float32x4_t vec_hsum(float32x4_t v) { + float32x4_t v_temp = v + vec_reve(v); + return v_temp[0] + v_temp[1]; +} + inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) { const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b); return acc + (vec_unpackh(p) + vec_unpackl(p)); From 5cdac4691c78d5e6b6bd9c4004a0b9a66655a072 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:33:59 +0800 Subject: [PATCH 05/12] ggml-cpu: fix incorrect return type vec_hsum Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 2c05985068a0a..1f6844e16cd34 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) { /** * @see https://github.com/ggml-org/llama.cpp/pull/14037 */ -inline float32x4_t vec_hsum(float32x4_t v) { +inline float vec_hsum(float32x4_t v) { float32x4_t v_temp = v + 
vec_reve(v); return v_temp[0] + v_temp[1]; } From 330a2a5d6dcd8f9200ea0f7e1fffcc4bf7f43627 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:35:31 +0800 Subject: [PATCH 06/12] ggml-cpu: q5_0 incomplete refactor + table_b2b_0 activation Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 60484857ec344..dc7cc3d73c03e 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -34,7 +34,7 @@ #define B8(c,s ) B7(c,s, c), B7(c,s, s) // precomputed tables for expanding 8bits to 8 bytes: -// static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 +static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif @@ -343,8 +343,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); - v_sum0 = vec_madd(v_sums0f, v_d0, v_sum0); - v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); + v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); } sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); From 4a72780b76e4eb6a1db7c435650f1f4b257fd95c Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:55:01 +0800 Subject: [PATCH 07/12] ggml-cpu: refactor q5_1 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 72 ++++++++++++++-------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index dc7cc3d73c03e..e1f1e4e2b845f 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -36,6 +36,12 @@ // precomputed tables for expanding 8bits to 8 bytes: static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 + +// permute mask for byteswapping +static const uint8x16_t v_kperm = (const uint8x16_t){ + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 +}; #endif void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -282,8 +288,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi uint64_t tmp0[4], tmp1[4]; const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8 }; #pragma GCC unroll 8 for (; ib + 1 < nb; ib += 2) { @@ -334,8 +338,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); - int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); const float32x4_t 
v_xy0f = vec_float(v_xy0); const float32x4_t v_xy1f = vec_float(v_xy1); @@ -418,18 +422,19 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) - float32x4_t v_sums0 = vec_splats(0.0f); - float32x4_t v_sums1 = vec_splats(0.0f); + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); - float summs0 = 0.0f, summs1 = 0.0f; + float summs0 = 0.0f; + float summs1 = 0.0f; - uint32_t qh0, qh1; - uint64_t tmp0[4], tmp1[4]; + uint32_t qh0; + uint32_t qh1; - uint8x16_t v_kperm = (const uint8x16_t){ - 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8 - }; + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); for (; ib + 1 < nb; ib += 2) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; @@ -437,8 +442,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); @@ -460,6 +463,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + // required for fixing the byteorder v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); @@ -478,33 +482,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l); const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h); - const int8x16_t v_y0l = vec_xl(0, y0->qs); - const int8x16_t v_y0h = vec_xl(16, y0->qs); - const int8x16_t v_y1l = vec_xl(0, y1->qs); - const int8x16_t v_y1h = vec_xl(16, y1->qs); + const int8x16_t v_y0l = vec_xl(0 , y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs); + const int8x16_t v_y1l = vec_xl(0 , y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs); - int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); - float32x4_t v_xy0f = vec_float(v_xy0); - float32x4_t v_xy1f = vec_float(v_xy1); + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); - v_sums0 = vec_madd(v_xy0f, v_d0, v_sums0); - v_sums1 = vec_madd(v_xy1f, v_d1, v_sums1); + v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); } - float32x4_t v_sumv = vec_add(v_sums0, v_sums1); - sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3] + summs0 + summs1; + sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1; for (; ib < nb; ++ib) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - float summs = GGML_CPU_FP16_TO_FP32(x0->m) * 
GGML_CPU_FP16_TO_FP32(y0->s); uint32_t qh; @@ -519,6 +520,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + // required for fixing the byteorder v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); @@ -529,16 +531,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_xlf = vec_or(v_xl, v_qhl); const int8x16_t v_xhf = vec_or(v_xh, v_qhh); - const int8x16_t v_yl = vec_xl(0, y0->qs); - const int8x16_t v_yh = vec_xl(16, y0->qs); + const int8x16_t v_yl = vec_xl(0 , y0->qs); + const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs); - int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); - float32x4_t v_xyf = vec_float(v_xy); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + const float32x4_t v_xyf = vec_float(v_xy); const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); - float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); - sumf += v_acc[0] + v_acc[1] + v_acc[2] + v_acc[3] + summs; + sumf += vec_hsum(v_acc) + summs; } *s = sumf; From 5a94a01a6c4ee2d848246cf028241668fc8ea4ea Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:01:49 +0800 Subject: [PATCH 08/12] ggml-cpu: q5_1 update loop unroll to 4 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index e1f1e4e2b845f..af4f44dfdb3a3 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -436,6 +436,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + #pragma GCC unroll 4 for (; ib + 1 < nb; ib += 2) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; @@ -502,6 +503,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1; + #pragma GCC unroll 4 for (; ib < nb; ++ib) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; From fd8f4a2d0599ff69d49106d8fc2b3eb3e7b7862b Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:05:47 +0800 Subject: [PATCH 09/12] ggml-cpu: update q5_0 unroll to 4 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index af4f44dfdb3a3..2901cf9200372 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -289,7 +289,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - #pragma GCC unroll 8 + #pragma GCC unroll 4 for (; ib + 1 < nb; ib += 2) { const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; @@ -353,7 +353,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); - #pragma GCC unroll 8 + #pragma GCC unroll 4 for (; ib < nb; ++ib) { const block_q5_0 * GGML_RESTRICT x0 = 
&x[ib]; const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; From 3815dea4354de0e2c7ad3cd53271b1e9bf1844ec Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:13:08 +0800 Subject: [PATCH 10/12] ggml-cpu: update build-s390x docs Signed-off-by: Aaron Teo --- docs/build-s390x.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index b36a1998144a1..9c93885eb4f67 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl | BF16 | 🚫 | 🚫 | ❓ | ❓ | | Q4_0 | ✅ | ✅ | ❓ | ❓ | | Q4_1 | ✅ | ✅ | ❓ | ❓ | -| Q5_0 | 🚫 | 🚫 | ❓ | ❓ | -| Q5_1 | 🚫 | 🚫 | ❓ | ❓ | +| MXFP4 | 🚫 | 🚫 | ❓ | ❓ | +| Q5_0 | ✅ | ✅ | ❓ | ❓ | +| Q5_1 | ✅ | ✅ | ❓ | ❓ | | Q8_0 | ✅ | ✅ | ❓ | ❓ | | Q2_K | 🚫 | 🚫 | ❓ | ❓ | | Q3_K | ✅ | ✅ | ❓ | ❓ | From 46284a0266507d4729ceaaa406855db1d02616bf Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:15:58 +0800 Subject: [PATCH 11/12] ggml-cpu: update unused variables q5_0 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 2901cf9200372..1c8176fb4d91f 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -395,9 +395,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - UNUSED(ib); + UNUSED(nb); UNUSED(x); UNUSED(y); + UNUSED(ib); UNUSED(sumf); ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); #endif From 9969fcb0cbd6deba330efdda3bcccef4baae44a3 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 13:56:54 +0800 Subject: [PATCH 12/12] docs: update the last update date Signed-off-by: Aaron Teo --- docs/build-s390x.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index 9c93885eb4f67..f3cdd63be3ece 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -292,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl - 🚫 - acceleration unavailable, will still run using scalar implementation - ❓ - acceleration unknown, please contribute if you can test it yourself -Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025. +Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
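
For reference, the short scalar sketch below shows what the vectorised q5_0 path in these patches computes per block, assuming the standard q5_0 layout (32 packed low nibbles in qs, a 32-bit mask of high bits in qh). table_b2b_1[b] simply precomputes eight of these offsets at once — byte j of an entry is 0x10 when bit j of b is clear and 0x00 when it is set — and because the 64-bit table entries land byte-reversed in a big-endian s390x vector register, the kernels apply the v_kperm reversal after loading them. This is an illustrative sketch, not code from the patches.

    #include <stdint.h>
    #include <string.h>

    // Expand one q5_0 block's quants into signed 8-bit values (before scaling by d).
    static void q5_0_block_to_int8(const uint8_t qs[16], const uint8_t qh_raw[4], int8_t out[32]) {
        uint32_t qh;
        memcpy(&qh, qh_raw, sizeof(qh));

        for (int j = 0; j < 16; ++j) {
            // offset is 16 when the high bit is clear and 0 when it is set,
            // so "nibble - offset" equals "((nibble | high_bit << 4) - 16)"
            const uint8_t off_lo = ((qh >> (j +  0)) & 1) ? 0x00 : 0x10;
            const uint8_t off_hi = ((qh >> (j + 16)) & 1) ? 0x00 : 0x10;

            out[j]      = (int8_t)((qs[j] & 0x0F) - off_lo);  // low nibbles use qh bits 0..15
            out[j + 16] = (int8_t)((qs[j] >>   4) - off_hi);  // high nibbles use qh bits 16..31
        }
    }

The q5_1 path is analogous, except the high bit is OR-ed in via table_b2b_0 (( b ) << 4) instead of subtracted, and the per-block bias term m * s is accumulated separately, matching the (nibble | bit << 4) * d + m dequantisation rule.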