@@ -77,16 +77,85 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp
7777 z [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) + GGML_CPU_FP16_TO_FP32 (y [i ]));
7878 }
7979}
80- inline static void ggml_vec_add1_f32 (const int n , float * z , const float * x , const float v ) { for (int i = 0 ; i < n ; ++ i ) z [i ] = x [i ] + v ; }
81- inline static void ggml_vec_acc_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] += x [i ]; }
82- inline static void ggml_vec_acc1_f32 (const int n , float * y , const float v ) { for (int i = 0 ; i < n ; ++ i ) y [i ] += v ; }
// z[k] = x[k] + v for every k in [0, n)
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low bits masked off: the part of the range handled in whole
    // GGML_F32_STEP strides (presumes GGML_F32_STEP is a power of two)
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // the scalar addend replicated across one vector register
    GGML_F32_VEC addend = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const float * src = x + k + r*GGML_F32_EPR;
            float       * dst = z + k + r*GGML_F32_EPR;

            GGML_F32_VEC loaded = GGML_F32_VEC_LOAD(src);
            GGML_F32_VEC summed = GGML_F32_VEC_ADD(loaded, addend);
            GGML_F32_VEC_STORE(dst, summed);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar path: leftover elements, or the whole range when GGML_SIMD is not defined
    while (k < n) {
        z[k] = x[k] + v;
        ++k;
    }
}
// y[k] += x[k] for every k in [0, n)
inline static void ggml_vec_acc_f32(const int n, float * y, const float * x) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low bits masked off: the part of the range handled in whole
    // GGML_F32_STEP strides (presumes GGML_F32_STEP is a power of two)
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            float       * acc = y + k + r*GGML_F32_EPR;
            const float * src = x + k + r*GGML_F32_EPR;

            // load the accumulator first, then the increment, as two separate vectors
            GGML_F32_VEC total = GGML_F32_VEC_LOAD(acc);
            GGML_F32_VEC delta = GGML_F32_VEC_LOAD(src);
            total = GGML_F32_VEC_ADD(total, delta);
            GGML_F32_VEC_STORE(acc, total);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar path: leftover elements, or the whole range when GGML_SIMD is not defined
    while (k < n) {
        y[k] += x[k];
        ++k;
    }
}
// y[k] += v for every k in [0, n)
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low bits masked off: the part of the range handled in whole
    // GGML_F32_STEP strides (presumes GGML_F32_STEP is a power of two)
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // the scalar increment replicated across one vector register
    GGML_F32_VEC delta = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            float * acc = y + k + r*GGML_F32_EPR;

            GGML_F32_VEC total = GGML_F32_VEC_LOAD(acc);
            total = GGML_F32_VEC_ADD(total, delta);
            GGML_F32_VEC_STORE(acc, total);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar path: leftover elements, or the whole range when GGML_SIMD is not defined
    while (k < n) {
        y[k] += v;
        ++k;
    }
}
// z[k] = x[k] - y[k] for every k in [0, n)
inline static void ggml_vec_sub_f32(const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        const float minuend    = x[k];
        const float subtrahend = y[k];
        z[k] = minuend - subtrahend;
        ++k;
    }
}
84137inline static void ggml_vec_sub_f16 (const int n , ggml_fp16_t * z , const ggml_fp16_t * x , const ggml_fp16_t * y ) {
85138 for (int i = 0 ; i < n ; ++ i ) {
86139 z [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) - GGML_CPU_FP16_TO_FP32 (y [i ]));
87140 }
88141}
89- inline static void ggml_vec_set_f32 (const int n , float * x , const float v ) { for (int i = 0 ; i < n ; ++ i ) x [i ] = v ; }
// x[k] = v for every k in [0, n)
inline static void ggml_vec_set_f32(const int n, float * x, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low bits masked off: the part of the range handled in whole
    // GGML_F32_STEP strides (presumes GGML_F32_STEP is a power of two)
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // the fill value replicated across one vector register
    GGML_F32_VEC fill = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            float * dst = x + k + r*GGML_F32_EPR;
            GGML_F32_VEC_STORE(dst, fill);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar path: leftover elements, or the whole range when GGML_SIMD is not defined
    while (k < n) {
        x[k] = v;
        ++k;
    }
}
// y[k] = x[k] for every k in [0, n), copied in ascending index order
inline static void ggml_vec_cpy_f32(const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = x[k];
        ++k;
    }
}
// y[k] = -x[k] for every k in [0, n)
inline static void ggml_vec_neg_f32(const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = -x[k];
        ++k;
    }
}
92161inline static void ggml_vec_neg_f16 (const int n , ggml_fp16_t * y , const ggml_fp16_t * x ) {
@@ -95,7 +164,24 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp
95164 }
96165}
97166
98- inline static void ggml_vec_mul_f32 (const int n , float * z , const float * x , const float * y ) { for (int i = 0 ; i < n ; ++ i ) z [i ] = x [i ]* y [i ]; }
// z[k] = x[k] * y[k] for every k in [0, n)
inline static void ggml_vec_mul_f32(const int n, float * z, const float * x, const float * y) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low bits masked off: the part of the range handled in whole
    // GGML_F32_STEP strides (presumes GGML_F32_STEP is a power of two)
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const float * lhs = x + k + r*GGML_F32_EPR;
            const float * rhs = y + k + r*GGML_F32_EPR;
            float       * dst = z + k + r*GGML_F32_EPR;

            GGML_F32_VEC a    = GGML_F32_VEC_LOAD(lhs);
            GGML_F32_VEC b    = GGML_F32_VEC_LOAD(rhs);
            GGML_F32_VEC prod = GGML_F32_VEC_MUL(a, b);
            GGML_F32_VEC_STORE(dst, prod);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar path: leftover elements, or the whole range when GGML_SIMD is not defined
    while (k < n) {
        z[k] = x[k]*y[k];
        ++k;
    }
}
99185inline static void ggml_vec_mul_f16 (const int n , ggml_fp16_t * z , const ggml_fp16_t * x , const ggml_fp16_t * y ) {
100186 for (int i = 0 ; i < n ; ++ i ) {
101187 z [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) * GGML_CPU_FP16_TO_FP32 (y [i ]));
0 commit comments