Skip to content

Commit 20d2017

Browse files
committed
ggml-cpu : add more rvv ops
1 parent 3492e6b commit 20d2017

File tree

3 files changed

+48
-2
lines changed

3 files changed

+48
-2
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3221,6 +3221,13 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
32213221
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
32223222
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
32233223
}
3224+
#elif defined(__riscv_zvfh)
3225+
for (int vl; i < n; i += vl) {
3226+
vl = __riscv_vsetvl_e32m2(n - i);
3227+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
3228+
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
3229+
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
3230+
}
32243231
#endif
32253232
for (; i < n; ++i) {
32263233
y[i] = GGML_CPU_FP32_TO_FP16(x[i]);

ggml/src/ggml-cpu/vec.cpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
215215
ggml_float sumf = 0.0;
216216

217217

218-
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
218+
#if defined(GGML_SIMD)
219219
#if defined(__ARM_FEATURE_SVE)
220220
const int sve_register_length = svcntb() * 8; //get vector length
221221
const int ggml_f16_epr = sve_register_length / 16; // running when 16
@@ -278,6 +278,29 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
278278
sum1 = svmad_f16_x(pg, hx, hy, sum1);
279279
}
280280
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
281+
#elif defined(__riscv_v_intrinsic)
282+
#if defined(__riscv_zvfh)
283+
int vl = __riscv_vsetvlmax_e32m2();
284+
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
285+
vfloat32m2_t vsum;
286+
vfloat16m1_t ax;
287+
vfloat16m1_t ay;
288+
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
289+
for (int i = 0; i < n; i += vl) {
290+
vl = __riscv_vsetvl_e16m1(n - i);
291+
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
292+
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
293+
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
294+
}
295+
vl = __riscv_vsetvlmax_e32m1();
296+
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
297+
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
298+
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
299+
#else
300+
for (int i = 0; i < n; ++i) {
301+
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
302+
}
303+
#endif // __riscv_zvfh
281304
#else
282305
const int np = (n & ~(GGML_F16_STEP - 1));
283306

@@ -309,7 +332,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
309332
for (int i = 0; i < n; ++i) {
310333
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
311334
}
312-
#endif
335+
#endif // GGML_SIMD
313336

314337
*s = sumf;
315338
}
@@ -368,6 +391,14 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
368391
for (; i + 3 < n; i += 4) {
369392
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
370393
}
394+
#elif defined(__riscv_v_intrinsic)
395+
for (int vl; i < n; i += vl) {
396+
vl = __riscv_vsetvl_e32m2(n - i);
397+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
398+
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
399+
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
400+
__riscv_vse32_v_f32m2(&y[i], vy, vl);
401+
}
371402
#endif
372403
for (; i < n; ++i) {
373404
y[i] = ggml_silu_f32(x[i]) * g[i];

ggml/src/ggml-cpu/vec.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,6 +1269,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
12691269
vl);
12701270
}
12711271

1272+
// computes silu x/(1+exp(-x)) in single precision vector
1273+
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1274+
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1275+
const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1276+
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1277+
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1278+
}
1279+
12721280
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
12731281

12741282
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {

0 commit comments

Comments
 (0)