From e5a946943b5999796fe0574489e2549560ca7448 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 01:08:53 +0800 Subject: [PATCH 01/12] ggml-cpu: initial q5_0 impl for s390x Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch-fallback.h | 1 - ggml/src/ggml-cpu/arch/s390/quants.c | 96 ++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 0bfb92df17909..1e15249a294ce 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -150,7 +150,6 @@ #elif defined(__s390x__) // quants.c #define quantize_row_q8_K_generic quantize_row_q8_K -#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0 #define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 7e4229d0e46a9..9df491e17b139 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -241,6 +241,102 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t acc = vec_splats(0.0f); + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + const uint16x8_t v_ml = { 1, 2, 4, 8, 16, 32, 64, 128 }; + const uint16x8_t v_mh = { 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 }; + const uint16x8_t v_z = vec_splats((uint16_t)0); + + #pragma GCC unroll 8 + for (; ib < nb; ++ib) { + // Load 32-bit high flags (5th bit) into a 32-bit integer + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const uint16_t qh_e = qh & 0xFFFF; + const uint16_t qh_o = qh >> 16; + + const uint16x8_t v_qhe = vec_splats(qh_e); + const uint16x8_t v_qhel = vec_and(v_qhe, v_ml); + const uint16x8_t v_qheh = vec_and(v_qhe, v_mh); + const uint16x8_t v_mel = vec_cmpeq(v_qhel, v_z); + const uint16x8_t v_meh = vec_cmpeq(v_qheh, v_z); + + const uint16x8_t v_cel = vec_sr(v_mel, vec_splats((uint16_t)15)); + const uint16x8_t v_ceh = vec_sr(v_meh, vec_splats((uint16_t)15)); + + const uint16x8_t v_subel = vec_mul(v_cel, vec_splats((uint16_t)0x10)); + const uint16x8_t v_subeh = vec_mul(v_ceh, vec_splats((uint16_t)0x10)); + + const uint8x16_t v_qhep = vec_pack(v_subel, v_subeh); + + const uint16x8_t v_qho = vec_splats(qh_o); + const uint16x8_t v_qhol = vec_and(v_qho, v_ml); + const uint16x8_t v_qhoh = vec_and(v_qho, v_mh); + const uint16x8_t v_mol = vec_cmpeq(v_qhol, v_z); + const uint16x8_t v_moh = vec_cmpeq(v_qhoh, v_z); + + const uint16x8_t v_col = vec_sr(v_mol, vec_splats((uint16_t)15)); + const uint16x8_t v_coh = vec_sr(v_moh, vec_splats((uint16_t)15)); + + const uint16x8_t v_subol = vec_mul(v_col, vec_splats((uint16_t)0x10)); + const uint16x8_t v_suboh = vec_mul(v_coh, vec_splats((uint16_t)0x10)); + + const uint8x16_t v_qhop = vec_pack(v_subol, v_suboh); + + + const uint8x16_t v_x = vec_xl(0, x[ib].qs); + const int8x16_t 
v_xl = (int8x16_t)vec_and(v_x, v_m); + const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, vec_splats((uint8_t)4)); + + const int8x16_t v_xlf = vec_sub(v_xl, (int8x16_t)v_qhep); + const int8x16_t v_xhf = vec_sub(v_xh, (int8x16_t)v_qhop); + + const int8x16_t v_yl = vec_xl(0, y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot( + ggml_vec_dot(vec_splats(0), v_xlf, v_yl), + v_xhf, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * + GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3]; + *s = sumf; +#else + UNUSED(ib); + UNUSED(x); + UNUSED(y); + UNUSED(sumf); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; From 15067372e9a99e173588d199e9690ad2d251cfb0 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 14:46:44 +0800 Subject: [PATCH 02/12] ggml-cpu: updated q5_0 code for better performance Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 157 ++++++++++++++++++--------- 1 file changed, 106 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 9df491e17b139..180367cc0b95c 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -23,6 +23,21 @@ #define UNUSED GGML_UNUSED +#if defined(__VXE__) || defined(__VXE2__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +// static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 +static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); @@ -260,73 +275,113 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) - float32x4_t acc = vec_splats(0.0f); - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - const uint16x8_t v_ml = { 1, 2, 4, 8, 16, 32, 64, 128 }; - const uint16x8_t v_mh = { 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 }; - const uint16x8_t v_z = vec_splats((uint16_t)0); + const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 }; + + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + #pragma GCC unroll 8 + for (; ib + 1 < nb; ib += 2) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + uint32_t qh0, qh1; + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + uint64_t tmp0[4], tmp1[4]; + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = 
table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); + uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); + + uint8x16_t v_x0l = vec_and(v_x0, v_m); + uint8x16_t v_x0h = vec_sr(v_x0, vec_splats((uint8_t)0x04)); + uint8x16_t v_x1l = vec_and(v_x1, v_m); + uint8x16_t v_x1h = vec_sr(v_x1, vec_splats((uint8_t)0x04)); + + int8x16_t v_x0lf = vec_sub((int8x16_t)v_x0l, v_qh0l); + int8x16_t v_x0hf = vec_sub((int8x16_t)v_x0h, v_qh0h); + int8x16_t v_x1lf = vec_sub((int8x16_t)v_x1l, v_qh1l); + int8x16_t v_x1hf = vec_sub((int8x16_t)v_x1h, v_qh1h); + + int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); + int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); + int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); + + int32x4_t v_sums0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_sums1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + + float32x4_t v_sums0f = vec_float(v_sums0); + float32x4_t v_sums1f = vec_float(v_sums1); + + const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_sum0 = vec_madd(v_sums0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); + } + + float32x4_t v_sumv = vec_add(v_sum0, v_sum1); + sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3]; #pragma GCC unroll 8 for (; ib < nb; ++ib) { - // Load 32-bit high flags (5th bit) into a 32-bit integer uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - const uint16_t qh_e = qh & 0xFFFF; - const uint16_t qh_o = qh >> 16; - - const uint16x8_t v_qhe = vec_splats(qh_e); - const uint16x8_t v_qhel = vec_and(v_qhe, v_ml); - const uint16x8_t v_qheh = vec_and(v_qhe, v_mh); - const uint16x8_t v_mel = vec_cmpeq(v_qhel, v_z); - const uint16x8_t v_meh = vec_cmpeq(v_qheh, v_z); - - const uint16x8_t v_cel = vec_sr(v_mel, vec_splats((uint16_t)15)); - const uint16x8_t v_ceh = vec_sr(v_meh, vec_splats((uint16_t)15)); + uint64_t tmp[4]; + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; - const uint16x8_t v_subel = vec_mul(v_cel, vec_splats((uint16_t)0x10)); - const uint16x8_t v_subeh = vec_mul(v_ceh, vec_splats((uint16_t)0x10)); + int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); + int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); + v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); - const uint8x16_t v_qhep = vec_pack(v_subel, v_subeh); + uint8x16_t v_x = vec_xl(0, (const uint8_t *)x[ib].qs); + uint8x16_t v_xl = vec_and(v_x, v_m); + uint8x16_t v_xh = vec_sr(v_x, vec_splats((uint8_t)0x04)); - const 
uint16x8_t v_qho = vec_splats(qh_o); - const uint16x8_t v_qhol = vec_and(v_qho, v_ml); - const uint16x8_t v_qhoh = vec_and(v_qho, v_mh); - const uint16x8_t v_mol = vec_cmpeq(v_qhol, v_z); - const uint16x8_t v_moh = vec_cmpeq(v_qhoh, v_z); + int8x16_t v_xlf = vec_sub((int8x16_t)v_xl, v_qhl); + int8x16_t v_xhf = vec_sub((int8x16_t)v_xh, v_qhh); - const uint16x8_t v_col = vec_sr(v_mol, vec_splats((uint16_t)15)); - const uint16x8_t v_coh = vec_sr(v_moh, vec_splats((uint16_t)15)); + int8x16_t v_yl = vec_xl(0, (const int8_t *)y[ib].qs); + int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y[ib].qs); - const uint16x8_t v_subol = vec_mul(v_col, vec_splats((uint16_t)0x10)); - const uint16x8_t v_suboh = vec_mul(v_coh, vec_splats((uint16_t)0x10)); + int32x4_t v_sums = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + float32x4_t v_sumsf = vec_float(v_sums); - const uint8x16_t v_qhop = vec_pack(v_subol, v_suboh); - - - const uint8x16_t v_x = vec_xl(0, x[ib].qs); - const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); - const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, vec_splats((uint8_t)4)); - - const int8x16_t v_xlf = vec_sub(v_xl, (int8x16_t)v_qhep); - const int8x16_t v_xhf = vec_sub(v_xh, (int8x16_t)v_qhop); - - const int8x16_t v_yl = vec_xl(0, y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot( - ggml_vec_dot(vec_splats(0), v_xlf, v_yl), - v_xhf, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - - const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * - GGML_CPU_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); - acc = vec_madd(v_xy, v_d, acc); + sumf += acc[0] + acc[1] + acc[2] + acc[3]; } - sumf = acc[0] + acc[1] + acc[2] + acc[3]; *s = sumf; #else UNUSED(ib); From d02fbd8edee4af56330bbc88d3b5763db81657ed Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 21 Aug 2025 15:01:25 +0800 Subject: [PATCH 03/12] ggml-cpu: use optimised hsum for better performance Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 180367cc0b95c..d01e54fd2efac 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -345,7 +345,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } float32x4_t v_sumv = vec_add(v_sum0, v_sum1); - sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3]; + float32x4_t v_temp = v_sumv + vec_reve(v_sumv); // Optimised hsum + sumf += v_temp[0] + v_temp[1]; #pragma GCC unroll 8 for (; ib < nb; ++ib) { @@ -379,7 +380,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); - sumf += acc[0] + acc[1] + acc[2] + acc[3]; + float32x4_t v_temp = acc + vec_reve(acc); // Optimised hsum + sumf += v_temp[0] + v_temp[1]; } *s = sumf; From dd6deeffa0fed9435f8713e11280d9a216718cdf Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:31:28 +0800 Subject: [PATCH 04/12] ggml-cpu: introduce q5_1 simd + refactor q5_0 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch-fallback.h | 1 - ggml/src/ggml-cpu/arch/s390/quants.c | 238 ++++++++++++++++++++++----- 
ggml/src/ggml-cpu/ggml-cpu-impl.h | 8 + 3 files changed, 206 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 1e15249a294ce..373408a9c0955 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -150,7 +150,6 @@ #elif defined(__s390x__) // quants.c #define quantize_row_q8_K_generic quantize_row_q8_K -#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index d01e54fd2efac..60484857ec344 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -275,13 +275,16 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + uint32_t qh0, qh1; + uint64_t tmp0[4], tmp1[4]; + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }; - float32x4_t v_sum0 = vec_splats(0.0f); - float32x4_t v_sum1 = vec_splats(0.0f); - #pragma GCC unroll 8 for (; ib + 1 < nb; ib += 2) { const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; @@ -289,15 +292,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - uint32_t qh0, qh1; memcpy(&qh0, x0->qh, sizeof(qh0)); memcpy(&qh1, x1->qh, sizeof(qh1)); - uint64_t tmp0[4], tmp1[4]; tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; @@ -308,34 +310,35 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + // required for fixing the byteorder v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); - uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); - uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); + const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); + const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); - uint8x16_t v_x0l = vec_and(v_x0, v_m); - uint8x16_t v_x0h = vec_sr(v_x0, vec_splats((uint8_t)0x04)); - uint8x16_t v_x1l = vec_and(v_x1, v_m); - uint8x16_t v_x1h = vec_sr(v_x1, vec_splats((uint8_t)0x04)); + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); - int8x16_t v_x0lf = vec_sub((int8x16_t)v_x0l, v_qh0l); - int8x16_t v_x0hf = vec_sub((int8x16_t)v_x0h, v_qh0h); - int8x16_t v_x1lf = vec_sub((int8x16_t)v_x1l, v_qh1l); - int8x16_t v_x1hf = vec_sub((int8x16_t)v_x1h, v_qh1h); + const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l); + const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h); + const int8x16_t 
v_x1lf = vec_sub(v_x1l, v_qh1l); + const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h); - int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); - int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); - int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); - int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); + const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); - int32x4_t v_sums0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_sums1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); - float32x4_t v_sums0f = vec_float(v_sums0); - float32x4_t v_sums1f = vec_float(v_sums1); + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); @@ -344,14 +347,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); } - float32x4_t v_sumv = vec_add(v_sum0, v_sum1); - float32x4_t v_temp = v_sumv + vec_reve(v_sumv); // Optimised hsum - sumf += v_temp[0] + v_temp[1]; + sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); #pragma GCC unroll 8 for (; ib < nb; ++ib) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); + memcpy(&qh, x0->qh, sizeof(qh)); uint64_t tmp[4]; tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; @@ -361,27 +365,28 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + + // required for fixing the byteorder v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); - uint8x16_t v_x = vec_xl(0, (const uint8_t *)x[ib].qs); - uint8x16_t v_xl = vec_and(v_x, v_m); - uint8x16_t v_xh = vec_sr(v_x, vec_splats((uint8_t)0x04)); + const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); - int8x16_t v_xlf = vec_sub((int8x16_t)v_xl, v_qhl); - int8x16_t v_xhf = vec_sub((int8x16_t)v_xh, v_qhh); + const int8x16_t v_xlf = vec_sub(v_xl, v_qhl); + const int8x16_t v_xhf = vec_sub(v_xh, v_qhh); - int8x16_t v_yl = vec_xl(0, (const int8_t *)y[ib].qs); - int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y[ib].qs); + const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs); - int32x4_t v_sums = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); - float32x4_t v_sumsf = vec_float(v_sums); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + const float32x4_t v_xyf = vec_float(v_xy); - const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); - float32x4_t acc = vec_madd(v_sumsf, v_d, vec_splats(0.0f)); + const float32x4_t v_d = 
vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); - float32x4_t v_temp = acc + vec_reve(acc); // Optimised hsum - sumf += v_temp[0] + v_temp[1]; + sumf += vec_hsum(v_acc); } *s = sumf; @@ -394,6 +399,159 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sums0 = vec_splats(0.0f); + float32x4_t v_sums1 = vec_splats(0.0f); + + float summs0 = 0.0f, summs1 = 0.0f; + + uint32_t qh0, qh1; + uint64_t tmp0[4], tmp1[4]; + + uint8x16_t v_kperm = (const uint8x16_t){ + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 + }; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + const uint8x16_t v_x0 = vec_xl(0, x0->qs); + const uint8x16_t v_x1 = vec_xl(0, x1->qs); + + const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l); + const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h); + const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l); + const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h); + + const int8x16_t v_y0l = vec_xl(0, y0->qs); + const int8x16_t v_y0h = vec_xl(16, y0->qs); + const int8x16_t v_y1l = vec_xl(0, y1->qs); + const int8x16_t v_y1h = vec_xl(16, y1->qs); + + int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + + float32x4_t v_xy0f = vec_float(v_xy0); + float32x4_t v_xy1f = vec_float(v_xy1); + + const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); 
+ const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_sums0 = vec_madd(v_xy0f, v_d0, v_sums0); + v_sums1 = vec_madd(v_xy1f, v_d1, v_sums1); + } + + float32x4_t v_sumv = vec_add(v_sums0, v_sums1); + sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3] + summs0 + summs1; + + for (; ib < nb; ++ib) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + uint64_t tmp[4]; + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); + int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + + v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); + v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); + + const uint8x16_t v_x = vec_xl(0, x0->qs); + const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); + + const int8x16_t v_xlf = vec_or(v_xl, v_qhl); + const int8x16_t v_xhf = vec_or(v_xh, v_qhh); + + const int8x16_t v_yl = vec_xl(0, y0->qs); + const int8x16_t v_yh = vec_xl(16, y0->qs); + + int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + float32x4_t v_xyf = vec_float(v_xy); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + + float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); + sumf += v_acc[0] + v_acc[1] + v_acc[2] + v_acc[3] + summs; + } + + *s = sumf; +#else + UNUSED(nb); + UNUSED(x); + UNUSED(y); + UNUSED(ib); + UNUSED(sumf); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index d839cf5c55e81..2c05985068a0a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) { return v_abo + v_abe; } +/** + * @see https://github.com/ggml-org/llama.cpp/pull/14037 + */ +inline float32x4_t vec_hsum(float32x4_t v) { + float32x4_t v_temp = v + vec_reve(v); + return v_temp[0] + v_temp[1]; +} + inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) { const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b); return acc + (vec_unpackh(p) + vec_unpackl(p)); From 5cdac4691c78d5e6b6bd9c4004a0b9a66655a072 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:33:59 +0800 Subject: [PATCH 05/12] ggml-cpu: fix incorrect return type vec_hsum Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 2c05985068a0a..1f6844e16cd34 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) { /** * @see https://github.com/ggml-org/llama.cpp/pull/14037 */ -inline float32x4_t vec_hsum(float32x4_t v) { +inline float vec_hsum(float32x4_t v) { float32x4_t v_temp = v + 
vec_reve(v); return v_temp[0] + v_temp[1]; } From 330a2a5d6dcd8f9200ea0f7e1fffcc4bf7f43627 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:35:31 +0800 Subject: [PATCH 06/12] ggml-cpu: q5_0 incomplete refactor + table_b2b_0 activation Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 60484857ec344..dc7cc3d73c03e 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -34,7 +34,7 @@ #define B8(c,s ) B7(c,s, c), B7(c,s, s) // precomputed tables for expanding 8bits to 8 bytes: -// static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 +static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif @@ -343,8 +343,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); - v_sum0 = vec_madd(v_sums0f, v_d0, v_sum0); - v_sum1 = vec_madd(v_sums1f, v_d1, v_sum1); + v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); } sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); From 4a72780b76e4eb6a1db7c435650f1f4b257fd95c Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 01:55:01 +0800 Subject: [PATCH 07/12] ggml-cpu: refactor q5_1 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 72 ++++++++++++++-------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index dc7cc3d73c03e..e1f1e4e2b845f 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -36,6 +36,12 @@ // precomputed tables for expanding 8bits to 8 bytes: static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 + +// permute mask for byteswapping +static const uint8x16_t v_kperm = (const uint8x16_t){ + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 +}; #endif void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -282,8 +288,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi uint64_t tmp0[4], tmp1[4]; const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - const uint8x16_t v_kperm = (const uint8x16_t){ 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8 }; #pragma GCC unroll 8 for (; ib + 1 < nb; ib += 2) { @@ -334,8 +338,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); - int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); const float32x4_t 
v_xy0f = vec_float(v_xy0); const float32x4_t v_xy1f = vec_float(v_xy1); @@ -418,18 +422,19 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0.0f; #if defined(__VXE__) || defined(__VXE2__) - float32x4_t v_sums0 = vec_splats(0.0f); - float32x4_t v_sums1 = vec_splats(0.0f); + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); - float summs0 = 0.0f, summs1 = 0.0f; + float summs0 = 0.0f; + float summs1 = 0.0f; - uint32_t qh0, qh1; - uint64_t tmp0[4], tmp1[4]; + uint32_t qh0; + uint32_t qh1; - uint8x16_t v_kperm = (const uint8x16_t){ - 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8 - }; + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); for (; ib + 1 < nb; ib += 2) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; @@ -437,8 +442,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); @@ -460,6 +463,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + // required for fixing the byteorder v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); @@ -478,33 +482,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l); const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h); - const int8x16_t v_y0l = vec_xl(0, y0->qs); - const int8x16_t v_y0h = vec_xl(16, y0->qs); - const int8x16_t v_y1l = vec_xl(0, y1->qs); - const int8x16_t v_y1h = vec_xl(16, y1->qs); + const int8x16_t v_y0l = vec_xl(0 , y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs); + const int8x16_t v_y1l = vec_xl(0 , y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs); - int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); - int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); - float32x4_t v_xy0f = vec_float(v_xy0); - float32x4_t v_xy1f = vec_float(v_xy1); + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); - v_sums0 = vec_madd(v_xy0f, v_d0, v_sums0); - v_sums1 = vec_madd(v_xy1f, v_d1, v_sums1); + v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); } - float32x4_t v_sumv = vec_add(v_sums0, v_sums1); - sumf += v_sumv[0] + v_sumv[1] + v_sumv[2] + v_sumv[3] + summs0 + summs1; + sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1; for (; ib < nb; ++ib) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - float summs = GGML_CPU_FP16_TO_FP32(x0->m) * 
GGML_CPU_FP16_TO_FP32(y0->s); uint32_t qh; @@ -519,6 +520,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + // required for fixing the byteorder v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); @@ -529,16 +531,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_xlf = vec_or(v_xl, v_qhl); const int8x16_t v_xhf = vec_or(v_xh, v_qhh); - const int8x16_t v_yl = vec_xl(0, y0->qs); - const int8x16_t v_yh = vec_xl(16, y0->qs); + const int8x16_t v_yl = vec_xl(0 , y0->qs); + const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs); - int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); - float32x4_t v_xyf = vec_float(v_xy); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + const float32x4_t v_xyf = vec_float(v_xy); const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); - float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc); - sumf += v_acc[0] + v_acc[1] + v_acc[2] + v_acc[3] + summs; + sumf += vec_hsum(v_acc) + summs; } *s = sumf; From 5a94a01a6c4ee2d848246cf028241668fc8ea4ea Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:01:49 +0800 Subject: [PATCH 08/12] ggml-cpu: q5_1 update loop unroll to 4 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index e1f1e4e2b845f..af4f44dfdb3a3 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -436,6 +436,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + #pragma GCC unroll 4 for (; ib + 1 < nb; ib += 2) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; @@ -502,6 +503,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1; + #pragma GCC unroll 4 for (; ib < nb; ++ib) { const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; From fd8f4a2d0599ff69d49106d8fc2b3eb3e7b7862b Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:05:47 +0800 Subject: [PATCH 09/12] ggml-cpu: update q5_0 unroll to 4 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index af4f44dfdb3a3..2901cf9200372 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -289,7 +289,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t v_m = vec_splats((uint8_t)0x0F); - #pragma GCC unroll 8 + #pragma GCC unroll 4 for (; ib + 1 < nb; ib += 2) { const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; @@ -353,7 +353,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); - #pragma GCC unroll 8 + #pragma GCC unroll 4 for (; ib < nb; ++ib) { const block_q5_0 * GGML_RESTRICT x0 = 
&x[ib]; const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; From 3815dea4354de0e2c7ad3cd53271b1e9bf1844ec Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:13:08 +0800 Subject: [PATCH 10/12] ggml-cpu: update build-s390x docs Signed-off-by: Aaron Teo --- docs/build-s390x.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index b36a1998144a1..9c93885eb4f67 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl | BF16 | 🚫 | 🚫 | ❓ | ❓ | | Q4_0 | ✅ | ✅ | ❓ | ❓ | | Q4_1 | ✅ | ✅ | ❓ | ❓ | -| Q5_0 | 🚫 | 🚫 | ❓ | ❓ | -| Q5_1 | 🚫 | 🚫 | ❓ | ❓ | +| MXFP4 | 🚫 | 🚫 | ❓ | ❓ | +| Q5_0 | ✅ | ✅ | ❓ | ❓ | +| Q5_1 | ✅ | ✅ | ❓ | ❓ | | Q8_0 | ✅ | ✅ | ❓ | ❓ | | Q2_K | 🚫 | 🚫 | ❓ | ❓ | | Q3_K | ✅ | ✅ | ❓ | ❓ | From 46284a0266507d4729ceaaa406855db1d02616bf Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 02:15:58 +0800 Subject: [PATCH 11/12] ggml-cpu: update unused variables q5_0 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/s390/quants.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 2901cf9200372..1c8176fb4d91f 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -395,9 +395,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; #else - UNUSED(ib); + UNUSED(nb); UNUSED(x); UNUSED(y); + UNUSED(ib); UNUSED(sumf); ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); #endif From 9969fcb0cbd6deba330efdda3bcccef4baae44a3 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 13:56:54 +0800 Subject: [PATCH 12/12] docs: update the last update date Signed-off-by: Aaron Teo --- docs/build-s390x.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index 9c93885eb4f67..f3cdd63be3ece 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -292,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl - 🚫 - acceleration unavailable, will still run using scalar implementation - ❓ - acceleration unknown, please contribute if you can test it yourself -Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025. +Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
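
For reference, the short scalar sketch below shows what the vectorised q5_0 path in these patches computes per block, assuming the standard q5_0 layout (32 packed low nibbles in qs, a 32-bit mask of high bits in qh). table_b2b_1[b] simply precomputes eight of these offsets at once — byte j of an entry is 0x10 when bit j of b is clear and 0x00 when it is set — and because the 64-bit table entries land byte-reversed in a big-endian s390x vector register, the kernels apply the v_kperm reversal after loading them. This is an illustrative sketch, not code from the patches.

    #include <stdint.h>
    #include <string.h>

    // Expand one q5_0 block's quants into signed 8-bit values (before scaling by d).
    static void q5_0_block_to_int8(const uint8_t qs[16], const uint8_t qh_raw[4], int8_t out[32]) {
        uint32_t qh;
        memcpy(&qh, qh_raw, sizeof(qh));

        for (int j = 0; j < 16; ++j) {
            // offset is 16 when the high bit is clear and 0 when it is set,
            // so "nibble - offset" equals "((nibble | high_bit << 4) - 16)"
            const uint8_t off_lo = ((qh >> (j +  0)) & 1) ? 0x00 : 0x10;
            const uint8_t off_hi = ((qh >> (j + 16)) & 1) ? 0x00 : 0x10;

            out[j]      = (int8_t)((qs[j] & 0x0F) - off_lo);  // low nibbles use qh bits 0..15
            out[j + 16] = (int8_t)((qs[j] >>   4) - off_hi);  // high nibbles use qh bits 16..31
        }
    }

The q5_1 path is analogous, except the high bit is OR-ed in via table_b2b_0 (( b ) << 4) instead of subtracted, and the per-block bias term m * s is accumulated separately, matching the (nibble | bit << 4) * d + m dequantisation rule.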