@@ -215,7 +215,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
215215 ggml_float sumf = 0.0 ;
216216
217217
218- #if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
218+ #if defined(GGML_SIMD)
219219 #if defined(__ARM_FEATURE_SVE)
220220 const int sve_register_length = svcntb () * 8 ; // get vector length
221221 const int ggml_f16_epr = sve_register_length / 16 ; // running when 16
@@ -278,6 +278,29 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
278278 sum1 = svmad_f16_x (pg, hx, hy, sum1);
279279 }
280280 GGML_F16x_VEC_REDUCE (sumf, sum1, sum2, sum3, sum4);
281+ #elif defined(__riscv_v_intrinsic)
282+ #if defined(__riscv_zvfh)
283+ int vl = __riscv_vsetvlmax_e32m2 ();
284+ vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1 (0 .0f , 1 );
285+ vfloat32m2_t vsum;
286+ vfloat16m1_t ax;
287+ vfloat16m1_t ay;
288+ vsum = __riscv_vreinterpret_v_u32m2_f32m2 (__riscv_vmv_v_x_u32m2 (0 , vl));
289+ for (int i = 0 ; i < n; i += vl) {
290+ vl = __riscv_vsetvl_e16m1 (n - i);
291+ ax = __riscv_vle16_v_f16m1_tu (ax, (const _Float16 *)&x[i], vl);
292+ ay = __riscv_vle16_v_f16m1_tu (ay, (const _Float16 *)&y[i], vl);
293+ vsum = __riscv_vfwmacc_vv_f32m2_tu (vsum, ax, ay, vl);
294+ }
295+ vl = __riscv_vsetvlmax_e32m1 ();
296+ vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (vsum, 0 ), __riscv_vget_v_f32m2_f32m1 (vsum, 1 ), vl);
297+ vs = __riscv_vfredusum_vs_f32m1_f32m1 (ac0, vs, vl);
298+ sumf += __riscv_vfmv_f_s_f32m1_f32 (vs);
299+ #else
300+ for (int i = 0 ; i < n; ++i) {
301+ sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32 (x[i])*GGML_CPU_FP16_TO_FP32 (y[i]));
302+ }
303+ #endif // __riscv_zvfh
281304 #else
282305 const int np = (n & ~(GGML_F16_STEP - 1 ));
283306
@@ -309,7 +332,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
309332 for (int i = 0 ; i < n; ++i) {
310333 sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32 (x[i])*GGML_CPU_FP16_TO_FP32 (y[i]));
311334 }
312- #endif
335+ #endif // GGML_SIMD
313336
314337 *s = sumf;
315338}
@@ -368,6 +391,14 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
368391 for (; i + 3 < n; i += 4 ) {
369392 vst1q_f32 (y + i, vmulq_f32 (ggml_v_silu (vld1q_f32 (x + i)), vld1q_f32 (g + i)));
370393 }
394+ #elif defined(__riscv_v_intrinsic)
395+ for (int vl; i < n; i += vl) {
396+ vl = __riscv_vsetvl_e32m2 (n - i);
397+ vfloat32m2_t vx = __riscv_vle32_v_f32m2 (&x[i], vl);
398+ vfloat32m2_t vg = __riscv_vle32_v_f32m2 (&g[i], vl);
399+ vfloat32m2_t vy = __riscv_vfmul_vv_f32m2 (ggml_v_silu_m2 (vx, vl), vg, vl);
400+ __riscv_vse32_v_f32m2 (&y[i], vy, vl);
401+ }
371402#endif
372403 for (; i < n; ++i) {
373404 y[i] = ggml_silu_f32 (x[i]) * g[i];
0 commit comments