@@ -144,72 +144,72 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
         ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements

         ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0 * ggml_f16_epr, 0); // 8 elements
-        sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00 + ax1*ay1
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00 + ax1*ay1
         ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0 * ggml_f16_epr, 0); // 8 elements
-        sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10);
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

         ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements

         ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 1); // next 8 elements
-        sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01);
+        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 1);
-        sum_11 = GGML_F16x_VEC_FMA(ax2, ay2, sum_11);
+        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);

         ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);

         ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2 * ggml_f16_epr, 2);
-        sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02);
+        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
         ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2 * ggml_f16_epr, 2);
-        sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12);
+        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

         ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);

         ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3 * ggml_f16_epr, 3);
-        sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03);
+        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
         ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3 * ggml_f16_epr, 3);
-        sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13);
+        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);

         ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);

         ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4 * ggml_f16_epr, 4);

-        sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00);
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
         ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4 * ggml_f16_epr, 4);
-        sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10);
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);

         ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);

         ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5 * ggml_f16_epr, 5);

-        sum_01 = GGML_F16x_VEC_FMA(ax6, ay6, sum_01);
+        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
         ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5 * ggml_f16_epr, 5);
-        sum_11 = GGML_F16x_VEC_FMA(ax6, ay6, sum_11);
+        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);

         ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);

         ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6 * ggml_f16_epr, 6);

-        sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02);
+        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
         ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6 * ggml_f16_epr, 6);
-        sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12);
+        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);

         ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);

         ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7 * ggml_f16_epr, 7);

-        sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03);
+        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
         ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7 * ggml_f16_epr, 7);
-        sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13);
+        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
     }

     const int np2 = (n & ~(ggml_f16_epr - 1));
     for (int k = np; k < np2; k += ggml_f16_epr) {
         svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);

         svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
-        sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00);
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
         rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
-        sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10);
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
     }

     if (np2 < n) {
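Note on the hunk above: every GGML_F16x_VEC_FMA call site now passes the accumulator first, i.e. FMA(acc, a, b) = acc + a*b instead of FMA(a, b, acc) = a*b + acc. Both orders produce the same dot-product terms; the reorder matches an accumulator-first macro definition. A minimal sketch of such a macro on SVE, assuming (this is not the literal ggml definition) it forwards to the svmla intrinsic:

#include <arm_sve.h>

// Hypothetical stand-in for GGML_F16x_VEC_FMA(acc, a, b) == acc + a*b.
// svmla_f16_x(pg, op1, op2, op3) returns op1 + op2*op3, so the accumulator
// is the first operand, matching the new call order in the diff.
static inline svfloat16_t f16x_vec_fma_sketch(svfloat16_t acc, svfloat16_t a, svfloat16_t b) {
    return svmla_f16_x(svptrue_b16(), acc, a, b); // acc + a*b, all lanes active
}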
@@ -396,66 +396,66 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = 0; i < np; i += ggml_f16_step) {
         ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
         ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
-        ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);

         GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);

         ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
         ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
-        ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);

         GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);

         ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
         ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
-        ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);

         GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);

         ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
         ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
-        ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);

         GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);

         ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
         ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
-        ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);

         GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);

         ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
         ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
-        ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);

         GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);

         ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
         ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
-        ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);

         GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);

         ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
         ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
-        ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);

         GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
     }
     const int np2 = (n & ~(ggml_f16_epr - 1));
     for (int k = np; k < np2; k += ggml_f16_epr) {
         svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
         svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
-        ry = GGML_F16x_VEC_FMA(rx, vx, ry);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);

         GGML_F16x_VEC_STORE(y + k, ry, 0);
     }

     if (np2 < n) {
-        svbool_t pg = svwhilelt_b16 (np2 , n );
+        svbool_t pg = svwhilelt_b16(np2, n);
         svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
         svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
-        hy = svmad_f16_x(pg ,hx ,vx ,hy );
+        hy = svmad_f16_x(pg, hx, vx, hy);
         svst1_f16(pg, (__fp16 *)(y + np2), hy);
     }
 #else
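ggml_vec_mad_f16 is an axpy-style update, y[i] += x[i] * v: eight vectors per iteration in the unrolled loop, one vector per iteration in the np..np2 loop, and a single predicated step for the last partial vector. A self-contained sketch of the same pattern without the unrolling, using the intrinsics visible above (illustrative only, not the ggml implementation):

#include <arm_sve.h>

// Illustrative whole-array version: y[0..n) += x[0..n) * v.
// svwhilelt_b16 builds a lane mask, so the final partial vector
// needs no scalar cleanup loop.
static void vec_mad_f16_sketch(int n, __fp16 *y, const __fp16 *x, __fp16 v) {
    const svfloat16_t vx = svdup_f16(v);
    for (int k = 0; k < n; k += (int) svcnth()) {
        svbool_t    pg = svwhilelt_b16(k, n);  // active lanes: k .. min(k+VL, n)
        svfloat16_t hx = svld1_f16(pg, x + k);
        svfloat16_t hy = svld1_f16(pg, y + k);
        hy = svmad_f16_x(pg, hx, vx, hy);      // hy = hx*vx + hy
        svst1_f16(pg, y + k, hy);
    }
}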
@@ -674,7 +674,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     if (np < n) {
         svbool_t pg = svwhilelt_b16(np, n);
         svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
-        svfloat16_t out = svmul_f16_m( pg , hy , vx );
+        svfloat16_t out = svmul_f16_m(pg, hy, vx);
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
 #else
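The scale tail works the same way: svwhilelt_b16 masks off lanes past n, and the merging (_m) multiply leaves inactive lanes of hy untouched, so the predicated store never writes past the end of y. A hypothetical tail-only helper under those assumptions:

#include <arm_sve.h>

// Hypothetical helper: scales y[np..n) by v in one predicated step.
static void vec_scale_f16_tail_sketch(int np, int n, __fp16 *y, __fp16 v) {
    const svfloat16_t vx  = svdup_f16(v);
    svbool_t          pg  = svwhilelt_b16(np, n);
    svfloat16_t       hy  = svld1_f16(pg, y + np);
    svfloat16_t       out = svmul_f16_m(pg, hy, vx); // inactive lanes keep hy
    svst1_f16(pg, y + np, out);                      // masked store, no overrun
}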