Skip to content

Commit 0aca430

Browse files
committed
changed GGML_F16x_VEC_FMA for code consistency
1 parent 15e0c79 commit 0aca430

File tree

3 files changed

+41
-41
lines changed

3 files changed

+41
-41
lines changed

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
230230
#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
231231
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
232232

233-
#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, a, b, c)
233+
#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
234234
#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
235235
#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
236236
#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)

ggml/src/ggml-cpu/vec.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -214,46 +214,46 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
214214
for (int i = 0; i < np; i += ggml_f16_step) {
215215
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
216216
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
217-
sum1 = GGML_F16x_VEC_FMA(ax1, ay1, sum1);
217+
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
218218

219219
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
220220
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
221-
sum2 = GGML_F16x_VEC_FMA(ax2, ay2, sum2);
221+
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
222222

223223
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
224224
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
225-
sum3 = GGML_F16x_VEC_FMA(ax3, ay3, sum3);
225+
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
226226

227227
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
228228
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
229-
sum4 = GGML_F16x_VEC_FMA(ax4, ay4, sum4);
229+
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
230230

231231
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
232232
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
233-
sum1 = GGML_F16x_VEC_FMA(ax5, ay5, sum1);
233+
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
234234

235235
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
236236
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
237-
sum2 = GGML_F16x_VEC_FMA(ax6, ay6, sum2);
237+
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
238238

239239
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
240240
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
241-
sum3 = GGML_F16x_VEC_FMA(ax7, ay7, sum3);
241+
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
242242

243243
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
244244
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
245-
sum4 = GGML_F16x_VEC_FMA(ax8, ay8, sum4);
245+
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
246246
}
247247

248248
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
249249
for (int k = np; k < np2; k += ggml_f16_epr) {
250250
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
251251
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
252-
sum1 = GGML_F16x_VEC_FMA(rx, ry, sum1);
252+
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
253253
}
254254

255255
if (np2 < n) {
256-
svbool_t pg = svwhilelt_b16(np2,n);
256+
svbool_t pg = svwhilelt_b16(np2, n);
257257
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
258258
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
259259

ggml/src/ggml-cpu/vec.h

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -144,72 +144,72 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
144144
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
145145

146146
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
147-
sum_00 = GGML_F16x_VEC_FMA(ax1, ay1, sum_00); // sum_00 = sum_00+ax1*ay1
147+
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
148148
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
149-
sum_10 = GGML_F16x_VEC_FMA(ax1, ay1, sum_10);
149+
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
150150

151151
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
152152

153153
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
154-
sum_01 = GGML_F16x_VEC_FMA(ax2, ay2, sum_01);
154+
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
155155
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
156-
sum_11 = GGML_F16x_VEC_FMA(ax2, ay2,sum_11);
156+
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
157157

158158
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
159159

160160
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
161-
sum_02 = GGML_F16x_VEC_FMA(ax3, ay3, sum_02);
161+
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
162162
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
163-
sum_12 = GGML_F16x_VEC_FMA(ax3, ay3, sum_12);
163+
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
164164

165165
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
166166

167167
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
168-
sum_03 = GGML_F16x_VEC_FMA(ax4, ay4, sum_03);
168+
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
169169
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
170-
sum_13 = GGML_F16x_VEC_FMA(ax4, ay4, sum_13);
170+
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
171171

172172
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
173173

174174
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
175175

176-
sum_00 = GGML_F16x_VEC_FMA(ax5, ay5, sum_00);
176+
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
177177
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
178-
sum_10 = GGML_F16x_VEC_FMA(ax5, ay5, sum_10);
178+
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
179179

180180
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
181181

182182
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
183183

184-
sum_01 = GGML_F16x_VEC_FMA(ax6, ay6, sum_01);
184+
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
185185
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
186-
sum_11 = GGML_F16x_VEC_FMA(ax6, ay6,sum_11);
186+
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
187187

188188
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
189189

190190
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
191191

192-
sum_02 = GGML_F16x_VEC_FMA(ax7, ay7, sum_02);
192+
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
193193
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
194-
sum_12 = GGML_F16x_VEC_FMA(ax7, ay7, sum_12);
194+
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
195195

196196
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
197197

198198
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
199199

200-
sum_03 = GGML_F16x_VEC_FMA(ax8, ay8, sum_03);
200+
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
201201
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
202-
sum_13 = GGML_F16x_VEC_FMA(ax8, ay8, sum_13);
202+
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
203203
}
204204

205205
const int np2 = (n & ~(ggml_f16_epr - 1));
206206
for (int k = np; k < np2; k += ggml_f16_epr) {
207207
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
208208

209209
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
210-
sum_00 = GGML_F16x_VEC_FMA(rx, ry, sum_00);
210+
sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
211211
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
212-
sum_10 = GGML_F16x_VEC_FMA(rx, ry, sum_10);
212+
sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
213213
}
214214

215215
if (np2 < n) {
@@ -396,66 +396,66 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
396396
for (int i = 0; i < np; i += ggml_f16_step) {
397397
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
398398
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
399-
ay1 = GGML_F16x_VEC_FMA(ax1, vx, ay1);
399+
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
400400

401401
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
402402

403403
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
404404
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
405-
ay2 = GGML_F16x_VEC_FMA(ax2, vx, ay2);
405+
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
406406

407407
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
408408

409409
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
410410
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
411-
ay3 = GGML_F16x_VEC_FMA(ax3, vx, ay3);
411+
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
412412

413413
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
414414

415415
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
416416
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
417-
ay4 = GGML_F16x_VEC_FMA(ax4, vx, ay4);
417+
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
418418

419419
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
420420

421421
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
422422
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
423-
ay5 = GGML_F16x_VEC_FMA(ax5, vx, ay5);
423+
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
424424

425425
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
426426

427427
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
428428
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
429-
ay6 = GGML_F16x_VEC_FMA(ax6, vx, ay6);
429+
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
430430

431431
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
432432

433433
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
434434
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
435-
ay7 = GGML_F16x_VEC_FMA(ax7, vx, ay7);
435+
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
436436

437437
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
438438

439439
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
440440
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
441-
ay8 = GGML_F16x_VEC_FMA(ax8, vx, ay8);
441+
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
442442

443443
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
444444
}
445445
const int np2 = (n & ~(ggml_f16_epr - 1));
446446
for (int k = np; k < np2; k += ggml_f16_epr) {
447447
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
448448
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
449-
ry = GGML_F16x_VEC_FMA(rx, vx, ry);
449+
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
450450

451451
GGML_F16x_VEC_STORE(y + k, ry, 0);
452452
}
453453

454454
if (np2 < n) {
455-
svbool_t pg =svwhilelt_b16(np2, n);
455+
svbool_t pg = svwhilelt_b16(np2, n);
456456
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
457457
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
458-
hy = svmad_f16_x(pg,hx,vx,hy);
458+
hy = svmad_f16_x(pg, hx, vx, hy);
459459
svst1_f16(pg, (__fp16 *)(y + np2), hy);
460460
}
461461
#else
@@ -674,7 +674,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
674674
if (np < n) {
675675
svbool_t pg = svwhilelt_b16(np, n);
676676
svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
677-
svfloat16_t out = svmul_f16_m( pg, hy, vx );
677+
svfloat16_t out = svmul_f16_m(pg, hy, vx);
678678
svst1_f16(pg, (__fp16 *)(y + np), out);
679679
}
680680
#else

0 commit comments

Comments
 (0)