@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
163163
164164 ax1 = GGML_F32_VEC_LOAD (x + i );
165165 ay1 = GGML_F32_VEC_LOAD (y + i );
166- ay1 = GGML_F32_VEC_FMA (ax1 , vx , ay1 );
166+ ay1 = GGML_F32_VEC_FMA (ay1 , ax1 , vx );
167167
168168 GGML_F32_VEC_STORE (y + i , ay1 );
169169
170170 ax2 = GGML_F32_VEC_LOAD (x + i + 1 * ggml_f32_epr );
171171 ay2 = GGML_F32_VEC_LOAD (y + i + 1 * ggml_f32_epr );
172- ay2 = GGML_F32_VEC_FMA (ax2 , vx , ay2 );
172+ ay2 = GGML_F32_VEC_FMA (ay2 , ax2 , vx );
173173
174174 GGML_F32_VEC_STORE (y + i + 1 * ggml_f32_epr , ay2 );
175175
176176 ax3 = GGML_F32_VEC_LOAD (x + i + 2 * ggml_f32_epr );
177177 ay3 = GGML_F32_VEC_LOAD (y + i + 2 * ggml_f32_epr );
178- ay3 = GGML_F32_VEC_FMA (ax3 , vx , ay3 );
178+ ay3 = GGML_F32_VEC_FMA (ay3 , ax3 , vx );
179179
180180 GGML_F32_VEC_STORE (y + i + 2 * ggml_f32_epr , ay3 );
181181
182182 ax4 = GGML_F32_VEC_LOAD (x + i + 3 * ggml_f32_epr );
183183 ay4 = GGML_F32_VEC_LOAD (y + i + 3 * ggml_f32_epr );
184- ay4 = GGML_F32_VEC_FMA (ax4 , vx , ay4 );
184+ ay4 = GGML_F32_VEC_FMA (ay4 , ax4 , vx );
185185
186186 GGML_F32_VEC_STORE (y + i + 3 * ggml_f32_epr , ay4 );
187187
188188 ax5 = GGML_F32_VEC_LOAD (x + i + 4 * ggml_f32_epr );
189189 ay5 = GGML_F32_VEC_LOAD (y + i + 4 * ggml_f32_epr );
190- ay5 = GGML_F32_VEC_FMA (ax5 , vx , ay5 );
190+ ay5 = GGML_F32_VEC_FMA (ay5 , ax5 , vx );
191191
192192 GGML_F32_VEC_STORE (y + i + 4 * ggml_f32_epr , ay5 );
193193
194194 ax6 = GGML_F32_VEC_LOAD (x + i + 5 * ggml_f32_epr );
195195 ay6 = GGML_F32_VEC_LOAD (y + i + 5 * ggml_f32_epr );
196- ay6 = GGML_F32_VEC_FMA (ax6 , vx , ay6 );
196+ ay6 = GGML_F32_VEC_FMA (ay6 , ax6 , vx );
197197
198198 GGML_F32_VEC_STORE (y + i + 5 * ggml_f32_epr , ay6 );
199199
200200 ax7 = GGML_F32_VEC_LOAD (x + i + 6 * ggml_f32_epr );
201201 ay7 = GGML_F32_VEC_LOAD (y + i + 6 * ggml_f32_epr );
202- ay7 = GGML_F32_VEC_FMA (ax7 , vx , ay7 );
202+ ay7 = GGML_F32_VEC_FMA (ay7 , ax7 , vx );
203203
204204 GGML_F32_VEC_STORE (y + i + 6 * ggml_f32_epr , ay7 );
205205
206206 ax8 = GGML_F32_VEC_LOAD (x + i + 7 * ggml_f32_epr );
207207 ay8 = GGML_F32_VEC_LOAD (y + i + 7 * ggml_f32_epr );
208- ay8 = GGML_F32_VEC_FMA (ax8 , vx , ay8 );
208+ ay8 = GGML_F32_VEC_FMA (ay8 , ax8 , vx );
209209
210210 GGML_F32_VEC_STORE (y + i + 7 * ggml_f32_epr , ay8 );
211211 }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
215215 for (int i = np ; i < np2 ; i += ggml_f32_epr ) {
216216 ax1 = GGML_F32_VEC_LOAD (x + i );
217217 ay1 = GGML_F32_VEC_LOAD (y + i );
218- ay1 = GGML_F32_VEC_FMA (ax1 , vx , ay1 );
218+ ay1 = GGML_F32_VEC_FMA (ay1 , ax1 , vx );
219219
220220 GGML_F32_VEC_STORE (y + i , ay1 );
221221 }
0 commit comments