Skip to content

Commit af3102c

Browse files
committed
ggml-cpu : add more rvv ops
1 parent 9724607 commit af3102c

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3221,6 +3221,13 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
32213221
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
32223222
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
32233223
}
3224+
#elif defined(__riscv_zvfh)
3225+
for (int vl; i < n; i += vl) {
3226+
vl = __riscv_vsetvl_e32m2(n - i);
3227+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
3228+
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
3229+
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
3230+
}
32243231
#endif
32253232
for (; i < n; ++i) {
32263233
y[i] = GGML_CPU_FP32_TO_FP16(x[i]);

ggml/src/ggml-cpu/vec.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,31 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
214214

215215
ggml_float sumf = 0.0;
216216

217-
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
217+
#if defined(GGML_SIMD)
218+
#if defined(__riscv_v_intrinsic)
219+
#if defined(__riscv_zvfh)
220+
int vl = __riscv_vsetvlmax_e32m2();
221+
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
222+
vfloat32m2_t vsum;
223+
vfloat16m1_t ax;
224+
vfloat16m1_t ay;
225+
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
226+
for (int i = 0; i < n; i += vl) {
227+
vl = __riscv_vsetvl_e16m1(n - i);
228+
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
229+
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
230+
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
231+
}
232+
vl = __riscv_vsetvlmax_e32m1();
233+
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
234+
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
235+
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
236+
#else
237+
for (int i = 0; i < n; ++i) {
238+
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
239+
}
240+
#endif // __riscv_zvfh
241+
#else
218242
const int np = (n & ~(GGML_F16_STEP - 1));
219243

220244
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -241,11 +265,12 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
241265

242266
// if you hit this, you are likely running outside the FP range
243267
assert(!isnan(sumf) && !isinf(sumf));
268+
#endif // __riscv_v_intrinsic
244269
#else
245270
for (int i = 0; i < n; ++i) {
246271
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
247272
}
248-
#endif
273+
#endif // GGML_SIMD
249274

250275
*s = sumf;
251276
}
@@ -292,6 +317,14 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
292317
for (; i + 3 < n; i += 4) {
293318
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
294319
}
320+
#elif defined(__riscv_v_intrinsic)
321+
for (int vl; i < n; i += vl) {
322+
vl = __riscv_vsetvl_e32m2(n - i);
323+
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
324+
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
325+
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
326+
__riscv_vse32_v_f32m2(&y[i], vy, vl);
327+
}
295328
#endif
296329
for (; i < n; ++i) {
297330
y[i] = ggml_silu_f32(x[i]) * g[i];

ggml/src/ggml-cpu/vec.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
10301030
vl);
10311031
}
10321032

1033+
// computes silu x/(1+exp(-x)) in single precision vector
1034+
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1035+
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1036+
const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1037+
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1038+
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1039+
}
1040+
10331041
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
10341042

10351043
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {

0 commit comments

Comments
 (0)