From a57fc345d327d2e5650ff6fbc8c2fdfa4df73e5c Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 6 Aug 2025 14:40:31 +0530 Subject: [PATCH 1/6] Added sve implementation for vec_dot_fp16 Kernel --- ggml/src/ggml-cpu/simd-mappings.h | 41 ++++ ggml/src/ggml-cpu/vec.cpp | 99 ++++++++-- ggml/src/ggml-cpu/vec.h | 308 +++++++++++++++++++++++++----- 3 files changed, 381 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index b4ad68c9fd647..ad8a99a65458b 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -220,6 +220,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F32_VEC_MUL GGML_F32xt_MUL #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE +// F16 SVE +#define DEFAULT_PG32 svptrue_b32() +#define DEFAULT_PG16 svptrue_b16() + +#define GGML_F32Cxt svfloat16_t +#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f) +#define GGML_F32Cxt_SET1(x) svdup_n_f16(x) +#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p)) +#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) + +#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, a, b, c) +#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) +#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b) +#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__) +#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED + +#define GGML_F16x_VEC GGML_F32Cxt +#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO +#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1 +#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p) +#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r) +#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA +#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD +#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL +#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE + +#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a) +#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__) + +#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \ +{ \ + sum1 = svadd_f16_x(pg16, sum1, sum2); \ + sum3 = svadd_f16_x(pg16, sum3, sum4); \ + sum1 = svadd_f16_x(pg16, sum1, sum3); \ + __fp16 sum_f16 = svaddv_f16(pg16, sum1); \ + (res) = (ggml_float) sum_f16; \ +} +#define GGML_F16xt_REDUCE_MIXED(...) 
GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__) + // F16 NEON #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 07b377bdd82a7..1826f08848bc1 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -198,32 +198,93 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G ggml_float sumf = 0.0; #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; + #if defined(__ARM_FEATURE_SVE) + const int sve_register_length = svcntb() * 8; + const int ggml_f16_epr = sve_register_length / 16; // running when 16 + const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers + + const int np= (n & ~(ggml_f16_step - 1)); + svfloat16_t sum1 = svdup_n_f16(0.0f); + svfloat16_t sum2 = svdup_n_f16(0.0f); + svfloat16_t sum3 = svdup_n_f16(0.0f); + svfloat16_t sum4 = svdup_n_f16(0.0f); + + svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; + svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; + for (int i = 0; i < np; i += ggml_f16_step) { + ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); + ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); + sum1 = GGML_F16x_VEC_FMA(ax1, ay1, sum1); + + ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); + ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); + sum2 = GGML_F16x_VEC_FMA(ax2, ay2, sum2); + + ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); + ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); + sum3 = GGML_F16x_VEC_FMA(ax3, ay3, sum3); + + ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); + ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); + sum4 = GGML_F16x_VEC_FMA(ax4, ay4, sum4); + + ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); + ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); + sum1 = GGML_F16x_VEC_FMA(ax5, ay5, sum1); + + ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); + ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); + sum2 = GGML_F16x_VEC_FMA(ax6, ay6, sum2); + + ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); + ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); + sum3 = GGML_F16x_VEC_FMA(ax7, ay7, sum3); + + ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); + ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); + sum4 = GGML_F16x_VEC_FMA(ax8, ay8, sum4); + } - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8 + for (int k = np; k < np2; k += ggml_f16_epr) { + svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); + svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); + sum1 = GGML_F16x_VEC_FMA(rx, ry, sum1); + } - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + if (np2 < n) { + svbool_t pg = svwhilelt_b16(np2,n); + svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); + svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + sum1 = svmad_f16_x(pg, hx, hy, sum1); } - } + GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4); + #else + const int np = (n & ~(GGML_F16_STEP - 1)); - // reduce sum0..sum3 to sum0 - GGML_F16_VEC_REDUCE(sumf, sum); + GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; - // leftovers - for (int i = np; i < n; ++i) { - sumf += 
(ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); - } + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - // if you hit this, you are likely running outside the FP range - assert(!isnan(sumf) && !isinf(sumf)); + sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F16_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); + } + #endif #else for (int i = 0; i < n; ++i) { sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 2250d93cb00d1..469ec3bc1d5e4 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -119,36 +119,142 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG } #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); + #if defined(__ARM_FEATURE_SVE) + + const int sve_register_length = svcntb() * 8; + const int ggml_f16_epr = sve_register_length / 16; // running when 16 + const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers + + const int np = (n & ~(ggml_f16_step - 1)); + + svfloat16_t sum_00 = svdup_n_f16(0.0f); + svfloat16_t sum_01 = svdup_n_f16(0.0f); + svfloat16_t sum_02 = svdup_n_f16(0.0f); + svfloat16_t sum_03 = svdup_n_f16(0.0f); + + svfloat16_t sum_10 = svdup_n_f16(0.0f); + svfloat16_t sum_11 = svdup_n_f16(0.0f); + svfloat16_t sum_12 = svdup_n_f16(0.0f); + svfloat16_t sum_13 = svdup_n_f16(0.0f); + + svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; + svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; + + for (int i = 0; i < np; i += ggml_f16_step) { + ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements + + ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst + sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00+ax1*ay1 + ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements + sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10); + + ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements + + ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements + sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01); + ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); + sum_11 = GGML_F16x_VEC_FMA(ax2, ay2,sum_11); + + ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); + + ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); + sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02); + ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); + sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12); + + ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); + + ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3); + sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03); + ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3); + sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13); + + ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); + + ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4); + + sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00); + ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4); + sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10); + + ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); + + ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5); + + sum_01 = 
GGML_F16x_VEC_FMA(ax6, ay6, sum_01); + ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5); + sum_11 = GGML_F16x_VEC_FMA(ax6, ay6,sum_11); + + ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); + + ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6); + + sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02); + ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6); + sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12); + + ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7); - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03); + ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7); + sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13); + } + + const int np2 = (n & ~(ggml_f16_epr - 1)); + for (int k = np; k < np2; k += ggml_f16_epr) { + svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); + + svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0); + sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00); + rx = GGML_F16x_VEC_LOAD(x[1] + k, 0); + sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10); + } + + if (np2 < n) { + svbool_t pg = svwhilelt_b16(np2, n); + svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2)); + svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); + svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00); + sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10); + } + GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); + GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); + #else + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + } } } - } - // reduce sum0..sum3 to sum0 - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - GGML_F16_VEC_REDUCE(sumf[k], sum[k]); - } + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + } - // leftovers - for (int i = np; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); + } } - } + #endif #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { @@ -276,27 +382,105 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); + #if 
defined(__ARM_FEATURE_SVE) + const int sve_register_length = svcntb() * 8; + const int ggml_f16_epr = sve_register_length / 16; + const int ggml_f16_step = 8 * ggml_f16_epr; - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + const int np= (n & ~(ggml_f16_step - 1)); - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); + svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; + svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; + for (int i = 0; i < np; i += ggml_f16_step) { + ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); + ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); + ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1); - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0); + + ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); + ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); + ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2); + + GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1); + + ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); + ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); + ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3); + + GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2); + + ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); + ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); + ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4); + + GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3); + + ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); + ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); + ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5); + + GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4); + + ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); + ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); + ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6); + + GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5); + + ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); + ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); + ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7); + + GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6); + + ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); + ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); + ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8); + + GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7); } - } + const int np2 = (n & ~(ggml_f16_epr - 1)); + for (int k = np; k < np2; k += ggml_f16_epr) { + svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); + svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); + ry = GGML_F16x_VEC_FMA(rx, vx, ry); - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); - } + GGML_F16x_VEC_STORE(y + k, ry, 0); + } + + if (np2 < n) { + svbool_t pg =svwhilelt_b16(np2, n); + svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); + svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); + hy = svmad_f16_x(pg,hx,vx,hy); + svst1_f16(pg, (__fp16 *)(y + np2), hy); + } + #else + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] 
= GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); + } + #endif #else // scalar for (int i = 0; i < n; ++i) { @@ -467,25 +651,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); + #if defined(__ARM_FEATURE_SVE) + const int sve_register_length = svcntb() * 8; + const int ggml_f16_epr = sve_register_length / 16; + const int ggml_f16_step = 2 * ggml_f16_epr; + + GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); + const int np = (n & ~(ggml_f16_step - 1)); + svfloat16_t ay1, ay2; + + for (int i = 0; i < np; i += ggml_f16_step) { + ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); + ay1 = GGML_F16x_VEC_MUL(ay1, vx); + GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); + + ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); + ay2 = GGML_F16x_VEC_MUL(ay2, vx); + GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); + } + // leftovers + // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only + if (np < n) { + svbool_t pg = svwhilelt_b16(np, n); + svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); + svfloat16_t out = svmul_f16_m( pg, hy, vx ); + svst1_f16(pg, (__fp16 *)(y + np), out); + } + #else + const int np = (n & ~(GGML_F16_STEP - 1)); - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - GGML_F16_VEC ay[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_MUL(ay[j], vx); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_MUL(ay[j], vx); - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } } - } - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); - } + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); + } + #endif #else // scalar for (int i = 0; i < n; ++i) { From f89d04eccfbaa4aa0e278cc57814c670402c5d82 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 6 Aug 2025 15:42:05 +0530 Subject: [PATCH 2/6] removed white spaces --- ggml/src/ggml-cpu/vec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 469ec3bc1d5e4..2840598d3e417 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -659,7 +659,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); const int np = (n & ~(ggml_f16_step - 1)); svfloat16_t ay1, ay2; - + for (int i = 0; i < np; i += ggml_f16_step) { ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); ay1 = GGML_F16x_VEC_MUL(ay1, vx); From 543d5399cf1cff0d99b55c436240fbf35d726302 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 6 Aug 2025 15:59:25 +0530 Subject: [PATCH 3/6] Added comment --- ggml/src/ggml-cpu/vec.cpp | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 1826f08848bc1..6e9b7ac90ecc9 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -199,7 +199,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G #if defined(GGML_SIMD) #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; + const int sve_register_length = svcntb() * 8; //get vector length const int ggml_f16_epr = sve_register_length / 16; // running when 16 const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers From 15e0c7945e08d89e6d9561a946131832226b0687 Mon Sep 17 00:00:00 2001 From: vithulep Date: Wed, 6 Aug 2025 16:00:39 +0530 Subject: [PATCH 4/6] removed white spaces --- ggml/src/ggml-cpu/vec.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 6e9b7ac90ecc9..a96b721d92881 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -199,7 +199,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G #if defined(GGML_SIMD) #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; //get vector length + const int sve_register_length = svcntb() * 8; //get vector length const int ggml_f16_epr = sve_register_length / 16; // running when 16 const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers From 0aca430c7174a3482d6cd25db5c6eba7c6ece461 Mon Sep 17 00:00:00 2001 From: vithulep Date: Thu, 7 Aug 2025 11:30:25 +0530 Subject: [PATCH 5/6] changed GGML_F16x_VEC_FMA for code consistency --- ggml/src/ggml-cpu/simd-mappings.h | 2 +- ggml/src/ggml-cpu/vec.cpp | 20 +++++------ ggml/src/ggml-cpu/vec.h | 60 +++++++++++++++---------------- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index ad8a99a65458b..7eede4c83209c 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -230,7 +230,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p)) #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) -#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, a, b, c) +#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a) #define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__) #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) #define GGML_F32Cxt_ADD(...) 
GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__) diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index a96b721d92881..7e84f2cb8cfb7 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -214,46 +214,46 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G for (int i = 0; i < np; i += ggml_f16_step) { ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); - sum1 = GGML_F16x_VEC_FMA(ax1, ay1, sum1); + sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1); ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); - sum2 = GGML_F16x_VEC_FMA(ax2, ay2, sum2); + sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2); ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - sum3 = GGML_F16x_VEC_FMA(ax3, ay3, sum3); + sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3); ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - sum4 = GGML_F16x_VEC_FMA(ax4, ay4, sum4); + sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4); ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - sum1 = GGML_F16x_VEC_FMA(ax5, ay5, sum1); + sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5); ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - sum2 = GGML_F16x_VEC_FMA(ax6, ay6, sum2); + sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6); ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - sum3 = GGML_F16x_VEC_FMA(ax7, ay7, sum3); + sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7); ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - sum4 = GGML_F16x_VEC_FMA(ax8, ay8, sum4); + sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8); } const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8 for (int k = np; k < np2; k += ggml_f16_epr) { svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - sum1 = GGML_F16x_VEC_FMA(rx, ry, sum1); + sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry); } if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2,n); + svbool_t pg = svwhilelt_b16(np2, n); svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 2840598d3e417..901e29e554d29 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -144,62 +144,62 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst - sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00+ax1*ay1 + sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1 ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements - sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10); + sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1); ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements - sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01); + sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2); ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); - sum_11 = GGML_F16x_VEC_FMA(ax2, ay2,sum_11); + sum_11 = 
GGML_F16x_VEC_FMA(sum_11, ax2, ay2); ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); - sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02); + sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3); ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); - sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12); + sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3); ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3); - sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03); + sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4); ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3); - sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13); + sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4); ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4); - sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00); + sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5); ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4); - sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10); + sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5); ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5); - sum_01 = GGML_F16x_VEC_FMA(ax6, ay6, sum_01); + sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6); ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5); - sum_11 = GGML_F16x_VEC_FMA(ax6, ay6,sum_11); + sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6); ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6); - sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02); + sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7); ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6); - sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12); + sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7); ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7); - sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03); + sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8); ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7); - sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13); + sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8); } const int np2 = (n & ~(ggml_f16_epr - 1)); @@ -207,9 +207,9 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0); - sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00); + sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry); rx = GGML_F16x_VEC_LOAD(x[1] + k, 0); - sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10); + sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry); } if (np2 < n) { @@ -396,49 +396,49 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, for (int i = 0; i < np; i += ggml_f16_step) { ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1); + ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx); GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0); ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2); + ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx); GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1); ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3); + ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx); GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, 
ay3, 2); ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4); + ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx); GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3); ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5); + ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx); GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4); ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6); + ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx); GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5); ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7); + ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx); GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6); ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8); + ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx); GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7); } @@ -446,16 +446,16 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, for (int k = np; k < np2; k += ggml_f16_epr) { svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - ry = GGML_F16x_VEC_FMA(rx, vx, ry); + ry = GGML_F16x_VEC_FMA(ry, rx, vx); GGML_F16x_VEC_STORE(y + k, ry, 0); } if (np2 < n) { - svbool_t pg =svwhilelt_b16(np2, n); + svbool_t pg = svwhilelt_b16(np2, n); svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - hy = svmad_f16_x(pg,hx,vx,hy); + hy = svmad_f16_x(pg, hx, vx, hy); svst1_f16(pg, (__fp16 *)(y + np2), hy); } #else @@ -674,7 +674,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float if (np < n) { svbool_t pg = svwhilelt_b16(np, n); svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); - svfloat16_t out = svmul_f16_m( pg, hy, vx ); + svfloat16_t out = svmul_f16_m(pg, hy, vx); svst1_f16(pg, (__fp16 *)(y + np), out); } #else From 0e310f9e0478926667f02cbb97c97176442598aa Mon Sep 17 00:00:00 2001 From: Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Date: Thu, 28 Aug 2025 12:55:22 +0530 Subject: [PATCH 6/6] Update vec.h --- ggml/src/ggml-cpu/vec.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index f58838a4f3ef1..1f7c5996be96c 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -223,7 +223,6 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG } GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); - #elif defined(__riscv_v_intrinsic) // todo: RVV impl for (int i = 0; i < n; ++i) { @@ -481,7 +480,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, for (int i = 0; i < n; ++i) { y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } - #else const int np = (n & ~(GGML_F16_STEP - 1));
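
---
Reviewer note, for reference: the core pattern used throughout this series is a vector-length-agnostic FP16 dot product. Below is a minimal sketch, not part of the patch: it uses a single accumulator instead of the patch's 8-register unrolling, the function name sve_dot_f16_sketch is hypothetical, and it assumes a compiler with SVE and FP16 support (e.g. -march=armv8.2-a+sve). The intrinsics are the same ones the new GGML_F16x_VEC_* macros expand to (svld1_f16, svmad_f16_x, svwhilelt_b16, svaddv_f16, svdup_n_f16).

    #include <arm_sve.h>

    // Sketch only: vector-length-agnostic FP16 dot product.
    static float sve_dot_f16_sketch(const __fp16 * x, const __fp16 * y, int n) {
        const svbool_t pg_all = svptrue_b16();
        const int epr = (int) svcnth();          // FP16 lanes per vector; the patch derives
                                                 // the same value as svcntb()*8 / 16
        svfloat16_t acc = svdup_n_f16(0.0f);

        int i = 0;
        for (; i + epr <= n; i += epr) {         // full vectors
            svfloat16_t vx = svld1_f16(pg_all, x + i);
            svfloat16_t vy = svld1_f16(pg_all, y + i);
            acc = svmad_f16_x(pg_all, vx, vy, acc);   // acc += vx*vy
        }
        if (i < n) {                             // predicated tail, as in the patch epilogue
            svbool_t pg = svwhilelt_b16(i, n);
            svfloat16_t vx = svld1_f16(pg, x + i);    // inactive lanes load as zero
            svfloat16_t vy = svld1_f16(pg, y + i);
            acc = svmad_f16_x(pg_all, vx, vy, acc);   // zeroed lanes contribute 0*0
        }
        return (float) svaddv_f16(pg_all, acc);       // horizontal reduction
    }

Like the patch, this accumulates in FP16 and only widens at the end: GGML_F16xt_REDUCE_MIXED sums the four FP16 accumulators with svadd_f16_x, then performs a single svaddv_f16 and casts the result to ggml_float.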
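
The same load/compute/store structure backs the new ggml_vec_mad_f16 and ggml_vec_scale_f16 paths. A condensed sketch of the mad (y += x*v) case follows, again with a single register per iteration instead of the patch's 8-way unrolling and a hypothetical function name:

    #include <arm_sve.h>

    // Sketch only: y[i] += x[i]*v over FP16 data, mirroring the patched ggml_vec_mad_f16.
    static void sve_mad_f16_sketch(int n, __fp16 * y, const __fp16 * x, float v) {
        const svbool_t pg_all = svptrue_b16();
        const int epr = (int) svcnth();
        const svfloat16_t vv = svdup_n_f16((__fp16) v);   // broadcast scalar, as GGML_F16x_VEC_SET1 does

        int i = 0;
        for (; i + epr <= n; i += epr) {
            svfloat16_t vx = svld1_f16(pg_all, x + i);
            svfloat16_t vy = svld1_f16(pg_all, y + i);
            vy = svmad_f16_x(pg_all, vx, vv, vy);         // y += x*v
            svst1_f16(pg_all, y + i, vy);
        }
        if (i < n) {                                      // predicated leftovers
            svbool_t pg = svwhilelt_b16(i, n);
            svfloat16_t vx = svld1_f16(pg, x + i);
            svfloat16_t vy = svld1_f16(pg, y + i);
            vy = svmad_f16_x(pg, vx, vv, vy);
            svst1_f16(pg, y + i, vy);                     // store only the active lanes
        }
    }

Because the tail store is predicated, no scalar leftover loop is needed; this is why the SVE branches in the patch end with svwhilelt_b16 plus a predicated load/mad/store instead of the per-element FP16<->FP32 conversions used by the generic GGML_F16_STEP path.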