@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
 
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
         svst1_f32(pg, y + np2, ay1);
     }
+    #elif defined(__riscv_v_intrinsic)
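+    // strip-mine over n: vsetvl returns how many 32-bit lanes (LMUL=8) this pass handles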
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -276,6 +293,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+    #elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
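+        // keep this chunk of y in registers and fold in all GGML_VEC_MAD_UNROLL rows of x before one store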
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+    #elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+    #elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -928,7 +986,51 @@ inline static __m128 ggml_v_silu(__m128 x) {
     return _mm_div_ps(x, one_plus_exp_neg_x);
 }
 
-#endif // __ARM_NEON / __AVX2__ / __SSE2__
+#elif defined(__riscv_v_intrinsic)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+#ifdef __riscv_xtheadvector
+    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode 'th.vmv1r.v v2,v4')
+    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+#else
+    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+#endif
+    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+                                                    0x1.7f7d1cp-20f, n, vl);
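+    // here n = round(x*log2(e)) and b = x - n*ln2, with ln2 split into a high and a low term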
+    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+        __riscv_vfmacc_vv_f32m2(
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+            u, vl), u, vl);
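+    // j ~= expm1(b); if no lane has |n| > 126 the result is simply k + k*j = 2^n * e^b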
+    if (!__riscv_vcpop_m_b16(c, vl))
+        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
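+    // on the flagged lanes the 2^n scale is split into s1*s2 so each factor stays representable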
+    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+        c, vl);
+    return __riscv_vmerge_vvm_f32m2(
+        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+        vl);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
 
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
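The m2 helper above is written to be strip-mined by its caller, following the same avl/vsetvl pattern as the f32m8 loops earlier in this diff. A minimal sketch of such a caller, assuming <riscv_vector.h> is available; the wrapper name expf_array_rvv is illustrative and not part of this diff:

    // illustrative sketch: apply the LMUL=2 exp helper to a whole array
    static void expf_array_rvv(const float * x, float * y, int n) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m2(n - i);                    // lanes handled this pass
            vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], avl);  // load a chunk of x
            vfloat32m2_t vy = ggml_v_expf_m2(vx, avl);            // vectorized e^x
            __riscv_vse32_v_f32m2(&y[i], vy, avl);                // store the result
        }
    }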