From a57fc345d327d2e5650ff6fbc8c2fdfa4df73e5c Mon Sep 17 00:00:00 2001
From: vithulep
Date: Wed, 6 Aug 2025 14:40:31 +0530
Subject: [PATCH 1/6] Add SVE implementation for the vec_dot_fp16 kernel
---
ggml/src/ggml-cpu/simd-mappings.h | 41 ++++
ggml/src/ggml-cpu/vec.cpp | 99 ++++++++--
ggml/src/ggml-cpu/vec.h | 308 +++++++++++++++++++++++++-----
3 files changed, 381 insertions(+), 67 deletions(-)
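
Note: the SVE paths added below all follow the same shape: an 8-register unrolled main loop over ggml_f16_step elements, a single-register loop up to the next multiple of ggml_f16_epr, and a predicated svwhilelt_b16 tail. For orientation, a minimal single-accumulator sketch of the dot-product pattern (illustrative names, not the unrolled code in this patch; assumes <arm_sve.h> and an SVE-enabled build):

    #include <arm_sve.h>

    // Simplified fp16 dot product: one accumulator, the tail handled by the
    // governing predicate instead of a separate scalar loop.
    static float dot_f16_sve_sketch(const __fp16 * x, const __fp16 * y, int n) {
        svfloat16_t acc = svdup_n_f16(0.0f);
        int i = 0;
        svbool_t pg = svwhilelt_b16(i, n);
        while (svptest_any(svptrue_b16(), pg)) {
            svfloat16_t vx = svld1_f16(pg, x + i);
            svfloat16_t vy = svld1_f16(pg, y + i);
            acc = svmla_f16_m(pg, acc, vx, vy);   // acc += vx*vy on active lanes
            i  += (int) svcnth();                 // fp16 lanes per vector
            pg  = svwhilelt_b16(i, n);
        }
        return (float) svaddv_f16(svptrue_b16(), acc);
    }
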
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index b4ad68c9fd647..ad8a99a65458b 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -220,6 +220,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt svfloat16_t
+#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, a, b, c)
+#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{ \
+ sum1 = svadd_f16_x(pg16, sum1, sum2); \
+ sum3 = svadd_f16_x(pg16, sum3, sum4); \
+ sum1 = svadd_f16_x(pg16, sum1, sum3); \
+ __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
+ (res) = (ggml_float) sum_f16; \
+}
+#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
// F16 NEON
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
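
The GGML_F16xt_REDUCE_MIXED macro above keeps all accumulation in fp16 and widens to ggml_float only once, after a single horizontal add. Written out as a plain function (a sketch, not part of the patch; it returns double here purely for illustration), it amounts to:

    #include <arm_sve.h>

    // Fold four fp16 accumulators into one scalar; only the final value is
    // widened, matching what GGML_F16xt_REDUCE_MIXED expands to.
    static double reduce4_f16_sketch(svfloat16_t s1, svfloat16_t s2,
                                     svfloat16_t s3, svfloat16_t s4) {
        svbool_t pg = svptrue_b16();
        s1 = svadd_f16_x(pg, s1, s2);
        s3 = svadd_f16_x(pg, s3, s4);
        s1 = svadd_f16_x(pg, s1, s3);
        return (double) svaddv_f16(pg, s1);   // single horizontal reduction
    }
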
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index 07b377bdd82a7..1826f08848bc1 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -198,32 +198,93 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
ggml_float sumf = 0.0;
#if defined(GGML_SIMD)
- const int np = (n & ~(GGML_F16_STEP - 1));
-
- GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+ #if defined(__ARM_FEATURE_SVE)
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+ const int np = (n & ~(ggml_f16_step - 1));
+ svfloat16_t sum1 = svdup_n_f16(0.0f);
+ svfloat16_t sum2 = svdup_n_f16(0.0f);
+ svfloat16_t sum3 = svdup_n_f16(0.0f);
+ svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+ sum1 = GGML_F16x_VEC_FMA(ax1, ay1, sum1);
+
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+ sum2 = GGML_F16x_VEC_FMA(ax2, ay2, sum2);
+
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+ sum3 = GGML_F16x_VEC_FMA(ax3, ay3, sum3);
+
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+ sum4 = GGML_F16x_VEC_FMA(ax4, ay4, sum4);
+
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+ sum1 = GGML_F16x_VEC_FMA(ax5, ay5, sum1);
+
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+ sum2 = GGML_F16x_VEC_FMA(ax6, ay6, sum2);
+
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+ sum3 = GGML_F16x_VEC_FMA(ax7, ay7, sum3);
+
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+ sum4 = GGML_F16x_VEC_FMA(ax8, ay8, sum4);
+ }
- GGML_F16_VEC ax[GGML_F16_ARR];
- GGML_F16_VEC ay[GGML_F16_ARR];
+ const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to a multiple of ggml_f16_epr
+ for (int k = np; k < np2; k += ggml_f16_epr) {
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+ sum1 = GGML_F16x_VEC_FMA(rx, ry, sum1);
+ }
- for (int i = 0; i < np; i += GGML_F16_STEP) {
- for (int j = 0; j < GGML_F16_ARR; j++) {
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+ if (np2 < n) {
+ svbool_t pg = svwhilelt_b16(np2,n);
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
- sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+ sum1 = svmad_f16_x(pg, hx, hy, sum1);
}
- }
+ GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+ #else
+ const int np = (n & ~(GGML_F16_STEP - 1));
- // reduce sum0..sum3 to sum0
- GGML_F16_VEC_REDUCE(sumf, sum);
+ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
- // leftovers
- for (int i = np; i < n; ++i) {
- sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
- }
+ GGML_F16_VEC ax[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
+
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
- // if you hit this, you are likely running outside the FP range
- assert(!isnan(sumf) && !isinf(sumf));
+ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+ }
+ }
+
+ // reduce sum0..sum3 to sum0
+ GGML_F16_VEC_REDUCE(sumf, sum);
+
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+ }
+ #endif
#else
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
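
For intuition, the three phases of the SVE path above partition n as follows; the concrete numbers in this sketch assume a 256-bit SVE implementation (svcnth() == 16, so ggml_f16_step == 128):

    #include <arm_sve.h>
    #include <stdio.h>

    int main(void) {
        const int epr  = (int) svcnth();     // fp16 lanes per register (16 on 256-bit SVE)
        const int step = 8 * epr;            // elements per unrolled iteration
        const int n    = 300;
        const int np   = n & ~(step - 1);    // 256: covered by the 8-register loop
        const int np2  = n & ~(epr  - 1);    // 288: covered one register at a time
        printf("np=%d np2=%d tail=%d\n", np, np2, n - np2);  // tail (12) via svwhilelt_b16
        return 0;
    }
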
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 2250d93cb00d1..469ec3bc1d5e4 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -119,36 +119,142 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
}
#if defined(GGML_SIMD)
- const int np = (n & ~(GGML_F16_STEP - 1));
+ #if defined(__ARM_FEATURE_SVE)
+
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+ const int np = (n & ~(ggml_f16_step - 1));
+
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
+
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
+
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
+
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
+ sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00+ax1*ay1
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
+ sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10);
+
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
+
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
+ sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01);
+ ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
+ sum_11 = GGML_F16x_VEC_FMA(ax2, ay2,sum_11);
+
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+
+ ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
+ sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02);
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+ sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12);
+
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+
+ ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
+ sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03);
+ ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
+ sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13);
+
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+
+ ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
+
+ sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00);
+ ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
+ sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10);
+
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+
+ ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
+
+ sum_01 = GGML_F16x_VEC_FMA(ax6, ay6, sum_01);
+ ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
+ sum_11 = GGML_F16x_VEC_FMA(ax6, ay6,sum_11);
+
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+
+ ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
+
+ sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02);
+ ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
+ sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12);
+
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
- GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+ ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
- GGML_F16_VEC ax[GGML_F16_ARR];
- GGML_F16_VEC ay[GGML_F16_ARR];
+ sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03);
+ ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
+ sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13);
+ }
+
+ const int np2 = (n & ~(ggml_f16_epr - 1));
+ for (int k = np; k < np2; k += ggml_f16_epr) {
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
+ sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00);
+ rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
+ sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10);
+ }
+
+ if (np2 < n) {
+ svbool_t pg = svwhilelt_b16(np2, n);
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
- for (int i = 0; i < np; i += GGML_F16_STEP) {
- for (int j = 0; j < GGML_F16_ARR; j++) {
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
+ }
+ GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
+ GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+ #else
+ const int np = (n & ~(GGML_F16_STEP - 1));
+
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
- ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+ GGML_F16_VEC ax[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
- sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+ }
}
}
- }
- // reduce sum0..sum3 to sum0
- for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
- GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
- }
+ // reduce sum0..sum3 to sum0
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+ }
- // leftovers
- for (int i = np; i < n; ++i) {
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+ }
}
- }
+ #endif
#else
for (int i = 0; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -276,27 +382,105 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
- const int np = (n & ~(GGML_F16_STEP - 1));
+ #if defined(__ARM_FEATURE_SVE)
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16;
+ const int ggml_f16_step = 8 * ggml_f16_epr;
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
- GGML_F16_VEC ax[GGML_F16_ARR];
- GGML_F16_VEC ay[GGML_F16_ARR];
+ const int np = (n & ~(ggml_f16_step - 1));
- for (int i = 0; i < np; i += GGML_F16_STEP) {
- for (int j = 0; j < GGML_F16_ARR; j++) {
- ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
- ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1);
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+ GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2);
+
+ GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
+
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+ ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3);
+
+ GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
+
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+ ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4);
+
+ GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
+
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+ ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5);
+
+ GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+ ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6);
+
+ GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+ ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7);
+
+ GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+ ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8);
+
+ GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
}
- }
+ const int np2 = (n & ~(ggml_f16_epr - 1));
+ for (int k = np; k < np2; k += ggml_f16_epr) {
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+ ry = GGML_F16x_VEC_FMA(rx, vx, ry);
- // leftovers
- for (int i = np; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
- }
+ GGML_F16x_VEC_STORE(y + k, ry, 0);
+ }
+
+ if (np2 < n) {
+ svbool_t pg =svwhilelt_b16(np2, n);
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+ hy = svmad_f16_x(pg,hx,vx,hy);
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
+ }
+ #else
+ const int np = (n & ~(GGML_F16_STEP - 1));
+
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+ GGML_F16_VEC ax[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
+
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+ }
+ }
+
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+ }
+ #endif
#else
// scalar
for (int i = 0; i < n; ++i) {
@@ -467,25 +651,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
- const int np = (n & ~(GGML_F16_STEP - 1));
+ #if defined(__ARM_FEATURE_SVE)
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16;
+ const int ggml_f16_step = 2 * ggml_f16_epr;
+
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+ const int np = (n & ~(ggml_f16_step - 1));
+ svfloat16_t ay1, ay2;
+
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+ }
+ // leftovers
+ // at most ggml_f16_epr - 1 leftover elements; apply a predicated multiply to the remaining lanes only
+ if (np < n) {
+ svbool_t pg = svwhilelt_b16(np, n);
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+ svfloat16_t out = svmul_f16_m( pg, hy, vx );
+ svst1_f16(pg, (__fp16 *)(y + np), out);
+ }
+ #else
+ const int np = (n & ~(GGML_F16_STEP - 1));
- GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
- GGML_F16_VEC ay[GGML_F16_ARR];
+ GGML_F16_VEC ay[GGML_F16_ARR];
- for (int i = 0; i < np; i += GGML_F16_STEP) {
- for (int j = 0; j < GGML_F16_ARR; j++) {
- ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
- ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
+ for (int j = 0; j < GGML_F16_ARR; j++) {
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
- GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+ }
}
- }
- // leftovers
- for (int i = np; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+ }
+ #endif
#else
// scalar
for (int i = 0; i < n; ++i) {
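
ggml_vec_mad_f16 above is the fp16 axpy y += x*v; stripped of the 8-way unroll it reduces to the fully predicated form below (a sketch with illustrative names, not the code in this patch):

    #include <arm_sve.h>

    // y[i] += x[i] * v for i in [0, n), one predicated vector per iteration.
    static void mad_f16_sve_sketch(__fp16 * y, const __fp16 * x, float v, int n) {
        const svfloat16_t vv = svdup_n_f16((__fp16) v);
        for (int i = 0; i < n; i += (int) svcnth()) {
            svbool_t    pg = svwhilelt_b16(i, n);
            svfloat16_t vx = svld1_f16(pg, x + i);
            svfloat16_t vy = svld1_f16(pg, y + i);
            vy = svmla_f16_m(pg, vy, vx, vv);   // vy += vx * v on active lanes
            svst1_f16(pg, y + i, vy);           // inactive lanes are not written
        }
    }
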
From f89d04eccfbaa4aa0e278cc57814c670402c5d82 Mon Sep 17 00:00:00 2001
From: vithulep
Date: Wed, 6 Aug 2025 15:42:05 +0530
Subject: [PATCH 2/6] Remove trailing whitespace
---
ggml/src/ggml-cpu/vec.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 469ec3bc1d5e4..2840598d3e417 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -659,7 +659,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
const int np = (n & ~(ggml_f16_step - 1));
svfloat16_t ay1, ay2;
-
+
for (int i = 0; i < np; i += ggml_f16_step) {
ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_MUL(ay1, vx);
From 543d5399cf1cff0d99b55c436240fbf35d726302 Mon Sep 17 00:00:00 2001
From: vithulep
Date: Wed, 6 Aug 2025 15:59:25 +0530
Subject: [PATCH 3/6] Added comment
---
ggml/src/ggml-cpu/vec.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index 1826f08848bc1..6e9b7ac90ecc9 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -199,7 +199,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
- const int sve_register_length = svcntb() * 8;
+ const int sve_register_length = svcntb() * 8; //get vector length
const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
From 15e0c7945e08d89e6d9561a946131832226b0687 Mon Sep 17 00:00:00 2001
From: vithulep
Date: Wed, 6 Aug 2025 16:00:39 +0530
Subject: [PATCH 4/6] Remove trailing whitespace
---
ggml/src/ggml-cpu/vec.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index 6e9b7ac90ecc9..a96b721d92881 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -199,7 +199,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
- const int sve_register_length = svcntb() * 8; //get vector length
+ const int sve_register_length = svcntb() * 8; //get vector length
const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
From 0aca430c7174a3482d6cd25db5c6eba7c6ece461 Mon Sep 17 00:00:00 2001
From: vithulep
Date: Thu, 7 Aug 2025 11:30:25 +0530
Subject: [PATCH 5/6] Reorder GGML_F16x_VEC_FMA arguments for code consistency
---
ggml/src/ggml-cpu/simd-mappings.h | 2 +-
ggml/src/ggml-cpu/vec.cpp | 20 +++++------
ggml/src/ggml-cpu/vec.h | 60 +++++++++++++++----------------
3 files changed, 41 insertions(+), 41 deletions(-)
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index ad8a99a65458b..7eede4c83209c 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -230,7 +230,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
-#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, a, b, c)
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
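
The only functional change in this file is the argument order of the FMA wrapper: the macro now takes the accumulator first, matching the existing GGML_F16_VEC_FMA(acc, a, b) usage elsewhere in the file, and forwards the operands to svmad_f16_x (which computes op1*op2 + op3) accordingly. A quick self-contained sanity check of the swapped ordering (names here are illustrative, not from the patch):

    #include <arm_sve.h>
    #include <assert.h>

    #define FMA_ACC_FIRST(acc, a, b) svmad_f16_x(svptrue_b16(), (a), (b), (acc))

    int main(void) {
        svbool_t pg = svptrue_b16();
        svfloat16_t acc = svdup_n_f16(1.0f);
        svfloat16_t a   = svdup_n_f16(2.0f);
        svfloat16_t b   = svdup_n_f16(3.0f);
        svfloat16_t r   = FMA_ACC_FIRST(acc, a, b);               // every lane: 1 + 2*3 = 7
        assert((float) svaddv_f16(pg, r) == 7.0f * (float) svcnth());
        return 0;
    }
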
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
index a96b721d92881..7e84f2cb8cfb7 100644
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -214,46 +214,46 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
for (int i = 0; i < np; i += ggml_f16_step) {
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
- sum1 = GGML_F16x_VEC_FMA(ax1, ay1, sum1);
+ sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
- sum2 = GGML_F16x_VEC_FMA(ax2, ay2, sum2);
+ sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
- sum3 = GGML_F16x_VEC_FMA(ax3, ay3, sum3);
+ sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
- sum4 = GGML_F16x_VEC_FMA(ax4, ay4, sum4);
+ sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
- sum1 = GGML_F16x_VEC_FMA(ax5, ay5, sum1);
+ sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
- sum2 = GGML_F16x_VEC_FMA(ax6, ay6, sum2);
+ sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
- sum3 = GGML_F16x_VEC_FMA(ax7, ay7, sum3);
+ sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
- sum4 = GGML_F16x_VEC_FMA(ax8, ay8, sum4);
+ sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
}
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to a multiple of ggml_f16_epr
for (int k = np; k < np2; k += ggml_f16_epr) {
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
- sum1 = GGML_F16x_VEC_FMA(rx, ry, sum1);
+ sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
}
if (np2 < n) {
- svbool_t pg = svwhilelt_b16(np2,n);
+ svbool_t pg = svwhilelt_b16(np2, n);
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 2840598d3e417..901e29e554d29 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -144,62 +144,62 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
- sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00+ax1*ay1
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
- sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10);
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
- sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01);
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
- sum_11 = GGML_F16x_VEC_FMA(ax2, ay2,sum_11);
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
- sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02);
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
- sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12);
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
- sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03);
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
- sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13);
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
- sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00);
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
- sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10);
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
- sum_01 = GGML_F16x_VEC_FMA(ax6, ay6, sum_01);
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
- sum_11 = GGML_F16x_VEC_FMA(ax6, ay6,sum_11);
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
- sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02);
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
- sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12);
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
- sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03);
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
- sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13);
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
}
const int np2 = (n & ~(ggml_f16_epr - 1));
@@ -207,9 +207,9 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
- sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00);
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
- sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10);
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
}
if (np2 < n) {
@@ -396,49 +396,49 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
for (int i = 0; i < np; i += ggml_f16_step) {
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
- ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1);
+ ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
- ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2);
+ ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
- ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3);
+ ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
- ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4);
+ ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
- ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5);
+ ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
- ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6);
+ ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
- ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7);
+ ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
- ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8);
+ ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
}
@@ -446,16 +446,16 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
for (int k = np; k < np2; k += ggml_f16_epr) {
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
- ry = GGML_F16x_VEC_FMA(rx, vx, ry);
+ ry = GGML_F16x_VEC_FMA(ry, rx, vx);
GGML_F16x_VEC_STORE(y + k, ry, 0);
}
if (np2 < n) {
- svbool_t pg =svwhilelt_b16(np2, n);
+ svbool_t pg = svwhilelt_b16(np2, n);
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
- hy = svmad_f16_x(pg,hx,vx,hy);
+ hy = svmad_f16_x(pg, hx, vx, hy);
svst1_f16(pg, (__fp16 *)(y + np2), hy);
}
#else
@@ -674,7 +674,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
if (np < n) {
svbool_t pg = svwhilelt_b16(np, n);
svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
- svfloat16_t out = svmul_f16_m( pg, hy, vx );
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
svst1_f16(pg, (__fp16 *)(y + np), out);
}
#else
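
In the ggml_vec_scale_f16 tail touched above, svmul_f16_m is the merging form: lanes outside the predicate keep the value of the first vector operand, and the predicated svst1_f16 never writes them back, so elements at or past n are untouched. Just that tail, as a standalone sketch (illustrative names, same assumptions as the other sketches):

    #include <arm_sve.h>

    // Scale y[np..n) by v without writing anything at or beyond index n.
    static void scale_tail_f16_sketch(__fp16 * y, float v, int np, int n) {
        svbool_t    pg  = svwhilelt_b16(np, n);
        svfloat16_t vy  = svld1_f16(pg, y + np);
        svfloat16_t out = svmul_f16_m(pg, vy, svdup_n_f16((__fp16) v));
        svst1_f16(pg, y + np, out);
    }
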
From 0e310f9e0478926667f02cbb97c97176442598aa Mon Sep 17 00:00:00 2001
From: Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Date: Thu, 28 Aug 2025 12:55:22 +0530
Subject: [PATCH 6/6] Update vec.h
---
ggml/src/ggml-cpu/vec.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index f58838a4f3ef1..1f7c5996be96c 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -223,7 +223,6 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
}
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-
#elif defined(__riscv_v_intrinsic)
// todo: RVV impl
for (int i = 0; i < n; ++i) {
@@ -481,7 +480,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
for (int i = 0; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
-
#else
const int np = (n & ~(GGML_F16_STEP - 1));