@@ -44,6 +44,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
4444void ggml_vec_dot_f16 (int n , float * GGML_RESTRICT s , size_t bs , ggml_fp16_t * GGML_RESTRICT x , size_t bx , ggml_fp16_t * GGML_RESTRICT y , size_t by , int nrc );
4545
4646void ggml_vec_silu_f32 (const int n , float * y , const float * x );
47+ ggml_float ggml_vec_cvar_f32 (const int n , float * y , const float * x , const float mean ); //it will also center y ( y = y - mean )
4748ggml_float ggml_vec_soft_max_f32 (const int n , float * y , const float * x , float max );
4849ggml_float ggml_vec_log_soft_max_f32 (const int n , float * y , const float * x , float max );
4950
@@ -143,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
143144 for (int i = 0 ; i < np ; i += ggml_f16_step ) {
144145 ay1 = GGML_F16x_VEC_LOAD (y + i + 0 * ggml_f16_epr , 0 ); // 8 elements
145146
146- ax1 = GGML_F16x_VEC_LOAD (x [0 ] + i + 0 * ggml_f16_epr , 0 ); // 8 elemnst
147+ ax1 = GGML_F16x_VEC_LOAD (x [0 ] + i + 0 * ggml_f16_epr , 0 ); // 8 elements
147148 sum_00 = GGML_F16x_VEC_FMA (sum_00 , ax1 , ay1 ); // sum_00 = sum_00+ax1*ay1
148149 ax1 = GGML_F16x_VEC_LOAD (x [1 ] + i + 0 * ggml_f16_epr , 0 ); // 8 elements
149150 sum_10 = GGML_F16x_VEC_FMA (sum_10 , ax1 , ay1 );
150151
151152 ay2 = GGML_F16x_VEC_LOAD (y + i + 1 * ggml_f16_epr , 1 ); // next 8 elements
152153
153- ax2 = GGML_F16x_VEC_LOAD (x [0 ] + i + 1 * ggml_f16_epr , 1 ); // next 8 ekements
154+ ax2 = GGML_F16x_VEC_LOAD (x [0 ] + i + 1 * ggml_f16_epr , 1 ); // next 8 elements
154155 sum_01 = GGML_F16x_VEC_FMA (sum_01 , ax2 , ay2 );
155156 ax2 = GGML_F16x_VEC_LOAD (x [1 ] + i + 1 * ggml_f16_epr , 1 );
156157 sum_11 = GGML_F16x_VEC_FMA (sum_11 , ax2 , ay2 );
@@ -159,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
159160
160161 ax3 = GGML_F16x_VEC_LOAD (x [0 ] + i + 2 * ggml_f16_epr , 2 );
161162 sum_02 = GGML_F16x_VEC_FMA (sum_02 , ax3 , ay3 );
162- ax1 = GGML_F16x_VEC_LOAD (x [1 ] + i + 2 * ggml_f16_epr , 2 );
163+ ax3 = GGML_F16x_VEC_LOAD (x [1 ] + i + 2 * ggml_f16_epr , 2 );
163164 sum_12 = GGML_F16x_VEC_FMA (sum_12 , ax3 , ay3 );
164165
165166 ay4 = GGML_F16x_VEC_LOAD (y + i + 3 * ggml_f16_epr , 3 );
@@ -819,7 +820,8 @@ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_f
819820inline static void ggml_vec_elu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : expm1f (x [i ]); }
820821inline static void ggml_vec_elu_f16 (const int n , ggml_fp16_t * y , const ggml_fp16_t * x ) {
821822 for (int i = 0 ; i < n ; ++ i ) {
822- y [i ] = GGML_CPU_FP32_TO_FP16 (expm1f (GGML_CPU_FP16_TO_FP32 (x [i ])));
823+ const float v = GGML_CPU_FP16_TO_FP32 (x [i ]);
824+ y [i ] = GGML_CPU_FP32_TO_FP16 ((v > 0.f ) ? v : expm1f (v ));
823825 }
824826}
825827inline static void ggml_vec_relu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : 0.f ; }
0 commit comments