@@ -737,7 +737,39 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
737737 }
738738#endif
739739
740- #if defined(__ARM_NEON ) && defined(__aarch64__ )
740+ #if defined(__ARM_FEATURE_SVE ) && defined(__aarch64__ )
741+
742+ inline static svfloat32_t ggml_v_expf (svbool_t pg , svfloat32_t x ) {
743+ const svfloat32_t r = svdup_n_f32_x (pg , 0x1.8p23f );
744+ const svfloat32_t z = svmla_n_f32_x (pg , r , x , 0x1.715476p+0f );
745+ const svfloat32_t n = svsub_f32_x (pg , z , r );
746+ const svfloat32_t b = svmls_n_f32_x (pg , svmls_n_f32_x (pg , x , n , 0x1.62e4p-1f ), n , 0x1.7f7d1cp-20f );
747+ const svuint32_t e = svlsl_n_u32_x (pg , svreinterpret_u32_f32 (z ), 23 );
748+ const svfloat32_t k = svreinterpret_f32_u32 (svadd_u32_x (pg , e , svreinterpret_u32_f32 (svdup_n_f32_x (pg , 1 ))));
749+ const svbool_t c = svacgt_n_f32 (pg , n , 126 );
750+ const svfloat32_t u = svmul_f32_x (pg , b , b );
751+ const svfloat32_t j = svmla_f32_x (pg ,
752+ svmul_n_f32_x (pg , b , 0x1.ffffecp-1f ),
753+ svmla_f32_x (pg , svmla_f32_x (pg , svdup_n_f32_x (pg , 0x1.fffdb6p-2f ), svdup_n_f32_x (pg , 0x1.555e66p-3f ), b ),
754+ svmla_f32_x (pg , svdup_n_f32_x (pg , 0x1.573e2ep-5f ), svdup_n_f32_x (pg , 0x1.0e4020p-7f ), b ), u ), u );
755+ const svuint32_t d = svdup_n_u32_z (svcmple_n_f32 (pg , n , 0.0 ), 0x82000000 );
756+ const svfloat32_t s1 = svreinterpret_f32_u32 (svadd_n_u32_x (pg , d , 0x7f000000 ));
757+ const svfloat32_t s2 = svreinterpret_f32_u32 (svsub_u32_x (pg , e , d ));
758+ return svsel_f32 (svacgt_f32 (pg , n , svdup_n_f32_x (pg , 192 )), svmul_f32_x (pg , s1 , s1 ),
759+ svsel_f32 (c , svmul_f32_x (pg , svmla_f32_x (pg , s2 , s2 , j ), s1 ), svmla_f32_x (pg , k , k , j )));
760+ }
761+
762+ // computes silu x/(1+exp(-x)) in single precision vector
763+ inline static svfloat32_t ggml_v_silu (svbool_t pg , svfloat32_t x ) {
764+ const svfloat32_t one = svdup_n_f32_x (pg , 1.0f );
765+ const svfloat32_t zero = svdup_n_f32_x (pg , 0.0f );
766+ const svfloat32_t neg_x = svsub_f32_x (pg , zero , x );
767+ const svfloat32_t exp_neg_x = ggml_v_expf (pg , neg_x );
768+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x (pg , one , exp_neg_x );
769+ return svdiv_f32_x (pg , x , one_plus_exp_neg_x );
770+ }
771+
772+ #elif defined(__ARM_NEON ) && defined(__aarch64__ )
741773
742774// adapted from arm limited optimized routine
743775// the maximum error is 1.45358 plus 0.5 ulps
0 commit comments