@@ -214,7 +214,31 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
214214
215215 ggml_float sumf = 0.0 ;
216216
217- #if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
217+ #if defined(GGML_SIMD)
218+ #if defined(__riscv_v_intrinsic)
219+ #if defined(__riscv_zvfh)
220+ int vl = __riscv_vsetvlmax_e32m2 ();
221+ vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1 (0 .0f , 1 );
222+ vfloat32m2_t vsum;
223+ vfloat16m1_t ax;
224+ vfloat16m1_t ay;
225+ vsum = __riscv_vreinterpret_v_u32m2_f32m2 (__riscv_vmv_v_x_u32m2 (0 , vl));
226+ for (int i = 0 ; i < n; i += vl) {
227+ vl = __riscv_vsetvl_e16m1 (n - i);
228+ ax = __riscv_vle16_v_f16m1_tu (ax, (const _Float16 *)&x[i], vl);
229+ ay = __riscv_vle16_v_f16m1_tu (ay, (const _Float16 *)&y[i], vl);
230+ vsum = __riscv_vfwmacc_vv_f32m2_tu (vsum, ax, ay, vl);
231+ }
232+ vl = __riscv_vsetvlmax_e32m1 ();
233+ vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (vsum, 0 ), __riscv_vget_v_f32m2_f32m1 (vsum, 1 ), vl);
234+ vs = __riscv_vfredusum_vs_f32m1_f32m1 (ac0, vs, vl);
235+ sumf += __riscv_vfmv_f_s_f32m1_f32 (vs);
236+ #else
237+ for (int i = 0 ; i < n; ++i) {
238+ sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32 (x[i])*GGML_CPU_FP16_TO_FP32 (y[i]));
239+ }
240+ #endif // __riscv_zvfh
241+ #else
218242 const int np = (n & ~(GGML_F16_STEP - 1 ));
219243
220244 GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -241,11 +265,12 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
241265
242266 // if you hit this, you are likely running outside the FP range
243267 assert (!isnan (sumf) && !isinf (sumf));
268+ #endif // __riscv_v_intrinsic
244269#else
245270 for (int i = 0 ; i < n; ++i) {
246271 sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32 (x[i])*GGML_CPU_FP16_TO_FP32 (y[i]));
247272 }
248- #endif
273+ #endif // GGML_SIMD
249274
250275 *s = sumf;
251276}
@@ -292,6 +317,14 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
292317 for (; i + 3 < n; i += 4 ) {
293318 vst1q_f32 (y + i, vmulq_f32 (ggml_v_silu (vld1q_f32 (x + i)), vld1q_f32 (g + i)));
294319 }
320+ #elif defined(__riscv_v_intrinsic)
321+ for (int vl; i < n; i += vl) {
322+ vl = __riscv_vsetvl_e32m2 (n - i);
323+ vfloat32m2_t vx = __riscv_vle32_v_f32m2 (&x[i], vl);
324+ vfloat32m2_t vg = __riscv_vle32_v_f32m2 (&g[i], vl);
325+ vfloat32m2_t vy = __riscv_vfmul_vv_f32m2 (ggml_v_silu_m2 (vx, vl), vg, vl);
326+ __riscv_vse32_v_f32m2 (&y[i], vy, vl);
327+ }
295328#endif
296329 for (; i < n; ++i) {
297330 y[i] = ggml_silu_f32 (x[i]) * g[i];
0 commit comments