@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 }

 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+#endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
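Note on the `// todo: RVV impl` fallbacks: this and the other FP16 routines in this patch (`ggml_vec_mad_f16`, `ggml_vec_scale_f16`) stay scalar under `__riscv_v_intrinsic` because the generic `GGML_F16_*` SIMD macros have no RVV definition. A native path would need the `zvfh` half-precision extension. Purely as a sketch of what that could look like — the function name, the `_Float16` pointers, and the per-strip reduction are my assumptions, not part of this patch:

```c
#include <riscv_vector.h>

// hypothetical zvfh sketch: f16 inputs, f32 accumulation via widening multiply
static float dot_f16_rvv(int n, const _Float16 *x, const _Float16 *y) {
    vfloat32m1_t acc = __riscv_vfmv_v_f_f32m1(0.0f, 1);          // running sum in lane 0
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e16m1(n - i);
        vfloat16m1_t ax = __riscv_vle16_v_f16m1(&x[i], avl);
        vfloat16m1_t ay = __riscv_vle16_v_f16m1(&y[i], avl);
        vfloat32m2_t p  = __riscv_vfwmul_vv_f32m2(ax, ay, avl);  // widen f16*f16 -> f32
        acc = __riscv_vfredusum_vs_f32m2_f32m1(p, acc, avl);     // unordered sum into lane 0
    }
    return __riscv_vfmv_f_s_f32m1_f32(acc);
}
```

Reducing once per strip is simple but not optimal; a real implementation would keep a vector accumulator and reduce once at the end.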
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

         svst1_f32(pg, y + np2, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -276,6 +293,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
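One possible micro-optimization, offered as an untested suggestion rather than a defect: the broadcast of `b` does not depend on `i`, so it could be hoisted out of the loop by materializing it once at maximum vector length:

```c
// assumption: broadcasting at vlmax is safe here because later ops read only avl lanes
vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, __riscv_vsetvlmax_e32m8());
```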
@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
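Note the LMUL split in this patch: the streaming kernels above use `m8` (eight registers per operand, at most three values live at once), while `ggml_v_expf_m2` below drops to `m2`, presumably because it keeps roughly a dozen temporaries and masks live and would exhaust the 32-register file at a higher LMUL.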
@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -928,7 +986,51 @@ inline static __m128 ggml_v_silu(__m128 x) {
     return _mm_div_ps(x, one_plus_exp_neg_x);
 }

-#endif // __ARM_NEON / __AVX2__ / __SSE2__
+#elif defined(__riscv_v_intrinsic)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+#ifdef __riscv_xtheadvector
+    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
+    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+#else
+    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+#endif
+    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+                                                    0x1.7f7d1cp-20f, n, vl);
+    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+        __riscv_vfmacc_vv_f32m2(
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+            u, vl),
+        u, vl);
+    if (!__riscv_vcpop_m_b16(c, vl))
+        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
+    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+        c, vl);
+    return __riscv_vmerge_vvm_f32m2(
+        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+        vl);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
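For context on the new kernel: `ggml_v_expf_m2` is the standard shift-based exp construction. It computes exp(x) = 2^n · exp(b), where n = round(x·log2(e)) is recovered from the low mantissa bits of z = x·log2(e) + 0x1.8p23, b = x − n·ln2 is formed with a hi/lo split of ln2 for accuracy, 2^n is assembled directly in the float exponent field, and a degree-5 polynomial approximates expm1(b). The masked tail after the `vcpop` early-out rescales in two factors so that |n| > 126 does not overflow the exponent. A scalar paraphrase of the fast path — my illustration with the special cases omitted, not code from the patch:

```c
#include <stdint.h>
#include <string.h>

// scalar sketch of the same shift-based exp; valid away from the overflow range
static float expf_sketch(float x) {
    const float r = 0x1.8p23f;                          // 1.5*2^23: rounding shift
    float z = r + x * 0x1.715476p+0f;                   // low bits of z hold round(x*log2(e))
    float n = z - r;                                    // n as a float
    float b = x - n*0x1.62e4p-1f - n*0x1.7f7d1cp-20f;   // b = x - n*ln2 (hi + lo parts)
    uint32_t zi; memcpy(&zi, &z, sizeof zi);
    uint32_t ki = (zi << 23) + 0x3f800000u;             // bit pattern of 2^n
    float k; memcpy(&k, &ki, sizeof k);
    float u = b*b;                                      // Estrin evaluation of expm1(b)
    float j = 0x1.ffffecp-1f*b
            + ((0x1.fffdb6p-2f + 0x1.555e66p-3f*b)
            +  (0x1.573e2ep-5f + 0x1.0e4020p-7f*b)*u)*u;
    return k + k*j;                                     // 2^n * (1 + expm1(b)) = exp(x)
}
```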