@@ -77,16 +77,85 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp
7777        z [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) +  GGML_CPU_FP16_TO_FP32 (y [i ]));
7878    }
7979}
80- inline  static  void  ggml_vec_add1_f32 (const  int  n , float  *  z , const  float  *  x , const  float    v ) { for  (int  i  =  0 ; i  <  n ; ++ i ) z [i ]  =  x [i ] +  v ;    }
81- inline  static  void  ggml_vec_acc_f32  (const  int  n , float  *  y , const  float  *  x )                  { for  (int  i  =  0 ; i  <  n ; ++ i ) y [i ] +=  x [i ];        }
82- inline  static  void  ggml_vec_acc1_f32 (const  int  n , float  *  y , const  float    v )                  { for  (int  i  =  0 ; i  <  n ; ++ i ) y [i ] +=  v ;           }
// z[k] = x[k] + v for every k in [0, n).
//
// When GGML_SIMD is defined, the first (n & ~(GGML_F32_STEP - 1)) elements are processed in
// chunks of GGML_F32_STEP, each chunk as GGML_F32_ARR vector operations spaced GGML_F32_EPR
// floats apart; the remaining elements are handled by the scalar loop at the end.
// NOTE(review): the mask assumes GGML_F32_STEP is a power of two, and full coverage of a chunk
// assumes GGML_F32_ARR*GGML_F32_EPR == GGML_F32_STEP - confirm against the SIMD mappings.
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low (GGML_F32_STEP - 1) bits masked off
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // vector operand built from the scalar (presumably a broadcast of v to all lanes)
    GGML_F32_VEC vbias = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = k + r*GGML_F32_EPR;
            GGML_F32_VEC vsrc = GGML_F32_VEC_LOAD(x + off);
            GGML_F32_VEC vsum = GGML_F32_VEC_ADD(vsrc, vbias);
            GGML_F32_VEC_STORE(z + off, vsum);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar tail (the whole range when GGML_SIMD is not defined)
    while (k < n) {
        z[k] = x[k] + v;
        ++k;
    }
}
// y[k] += x[k] for every k in [0, n).
//
// When GGML_SIMD is defined, the first (n & ~(GGML_F32_STEP - 1)) elements are accumulated in
// chunks of GGML_F32_STEP, each chunk as GGML_F32_ARR vector operations spaced GGML_F32_EPR
// floats apart; the remaining elements are handled by the scalar loop at the end.
// NOTE(review): the mask assumes GGML_F32_STEP is a power of two, and full coverage of a chunk
// assumes GGML_F32_ARR*GGML_F32_EPR == GGML_F32_STEP - confirm against the SIMD mappings.
inline static void ggml_vec_acc_f32(const int n, float * y, const float * x) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low (GGML_F32_STEP - 1) bits masked off
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = k + r*GGML_F32_EPR;
            // load the accumulator first, then the addend, add and write back in place
            GGML_F32_VEC vacc = GGML_F32_VEC_LOAD(y + off);
            GGML_F32_VEC vsrc = GGML_F32_VEC_LOAD(x + off);
            vacc = GGML_F32_VEC_ADD(vacc, vsrc);
            GGML_F32_VEC_STORE(y + off, vacc);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar tail (the whole range when GGML_SIMD is not defined)
    while (k < n) {
        y[k] += x[k];
        ++k;
    }
}
// y[k] += v for every k in [0, n).
//
// When GGML_SIMD is defined, the first (n & ~(GGML_F32_STEP - 1)) elements are updated in
// chunks of GGML_F32_STEP, each chunk as GGML_F32_ARR vector operations spaced GGML_F32_EPR
// floats apart; the remaining elements are handled by the scalar loop at the end.
// NOTE(review): the mask assumes GGML_F32_STEP is a power of two, and full coverage of a chunk
// assumes GGML_F32_ARR*GGML_F32_EPR == GGML_F32_STEP - confirm against the SIMD mappings.
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low (GGML_F32_STEP - 1) bits masked off
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // vector operand built from the scalar (presumably a broadcast of v to all lanes)
    GGML_F32_VEC vbias = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = k + r*GGML_F32_EPR;
            GGML_F32_VEC vacc = GGML_F32_VEC_LOAD(y + off);
            vacc = GGML_F32_VEC_ADD(vacc, vbias);
            GGML_F32_VEC_STORE(y + off, vacc);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar tail (the whole range when GGML_SIMD is not defined)
    while (k < n) {
        y[k] += v;
        ++k;
    }
}
// z[k] = x[k] - y[k] for every k in [0, n); nothing is written when n <= 0.
inline static void ggml_vec_sub_f32(const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        z[k] = x[k] - y[k];
        ++k;
    }
}
84137inline  static  void  ggml_vec_sub_f16  (const  int  n , ggml_fp16_t  *  z , const  ggml_fp16_t  *  x , const  ggml_fp16_t  *  y ) {
85138    for  (int  i  =  0 ; i  <  n ; ++ i ) {
86139        z [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) -  GGML_CPU_FP16_TO_FP32 (y [i ]));
87140    }
88141}
89- inline  static  void  ggml_vec_set_f32  (const  int  n , float  *  x , const  float    v )                  { for  (int  i  =  0 ; i  <  n ; ++ i ) x [i ]  =  v ;           }
// x[k] = v for every k in [0, n).
//
// When GGML_SIMD is defined, the first (n & ~(GGML_F32_STEP - 1)) elements are written in
// chunks of GGML_F32_STEP, each chunk as GGML_F32_ARR vector stores spaced GGML_F32_EPR floats
// apart; the remaining elements are handled by the scalar loop at the end.
// NOTE(review): the mask assumes GGML_F32_STEP is a power of two, and full coverage of a chunk
// assumes GGML_F32_ARR*GGML_F32_EPR == GGML_F32_STEP - confirm against the SIMD mappings.
inline static void ggml_vec_set_f32(const int n, float * x, const float v) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low (GGML_F32_STEP - 1) bits masked off
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    // vector operand built from the scalar (presumably a broadcast of v to all lanes)
    GGML_F32_VEC vfill = GGML_F32_VEC_SET1(v);

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            GGML_F32_VEC_STORE(x + k + r*GGML_F32_EPR, vfill);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar tail (the whole range when GGML_SIMD is not defined)
    while (k < n) {
        x[k] = v;
        ++k;
    }
}
// y[k] = x[k] for every k in [0, n), copied one element at a time in increasing index order.
inline static void ggml_vec_cpy_f32(const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = x[k];
        ++k;
    }
}
// y[k] = -x[k] for every k in [0, n); nothing is written when n <= 0.
inline static void ggml_vec_neg_f32(const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = -x[k];
        ++k;
    }
}
92161inline  static  void  ggml_vec_neg_f16  (const  int  n , ggml_fp16_t  *  y , const  ggml_fp16_t  *  x ) {
@@ -95,7 +164,24 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp
95164    }
96165}
97166
98- inline  static  void  ggml_vec_mul_f32  (const  int  n , float  *  z , const  float  *  x , const  float  *  y ) { for  (int  i  =  0 ; i  <  n ; ++ i ) z [i ]  =  x [i ]* y [i ];   }
// z[k] = x[k] * y[k] for every k in [0, n).
//
// When GGML_SIMD is defined, the first (n & ~(GGML_F32_STEP - 1)) elements are multiplied in
// chunks of GGML_F32_STEP, each chunk as GGML_F32_ARR vector operations spaced GGML_F32_EPR
// floats apart; the remaining elements are handled by the scalar loop at the end.
// NOTE(review): the mask assumes GGML_F32_STEP is a power of two, and full coverage of a chunk
// assumes GGML_F32_ARR*GGML_F32_EPR == GGML_F32_STEP - confirm against the SIMD mappings.
inline static void ggml_vec_mul_f32(const int n, float * z, const float * x, const float * y) {
    int k = 0;
#if defined(GGML_SIMD)
    // n with the low (GGML_F32_STEP - 1) bits masked off
    const int n_vec = (n & ~(GGML_F32_STEP - 1));

    while (k < n_vec) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = k + r*GGML_F32_EPR;
            // both operands are loaded before the product is stored
            GGML_F32_VEC vlhs  = GGML_F32_VEC_LOAD(x + off);
            GGML_F32_VEC vrhs  = GGML_F32_VEC_LOAD(y + off);
            GGML_F32_VEC vprod = GGML_F32_VEC_MUL(vlhs, vrhs);
            GGML_F32_VEC_STORE(z + off, vprod);
        }
        k += GGML_F32_STEP;
    }
#endif
    // scalar tail (the whole range when GGML_SIMD is not defined)
    while (k < n) {
        z[k] = x[k]*y[k];
        ++k;
    }
}
99185inline  static  void  ggml_vec_mul_f16  (const  int  n , ggml_fp16_t  *  z , const  ggml_fp16_t  *  x , const  ggml_fp16_t  *  y ) {
100186    for  (int  i  =  0 ; i  <  n ; ++ i ) {
101187        z [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (x [i ]) *  GGML_CPU_FP16_TO_FP32 (y [i ]));
0 commit comments