Skip to content

Commit 59e01b1

Browse files
authored
Merge pull request #2799 from RajalakshmiSR/p10_ger
POWER10: Avoid setting accumulators to zero in gemm kernels
2 parents 514a3d7 + 317ff27 commit 59e01b1

File tree

2 files changed

+222
-138
lines changed

2 files changed

+222
-138
lines changed

kernel/power/dgemm_kernel_power10.c

Lines changed: 94 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
8787
rowC[0] += result[1] * alpha;
8888
#endif
8989

90-
#define SET_ACC_ZERO4() \
91-
__builtin_mma_xxsetaccz (&acc0); \
92-
__builtin_mma_xxsetaccz (&acc1); \
93-
__builtin_mma_xxsetaccz (&acc2); \
94-
__builtin_mma_xxsetaccz (&acc3);
95-
96-
#define SET_ACC_ZERO8() \
97-
__builtin_mma_xxsetaccz (&acc0); \
98-
__builtin_mma_xxsetaccz (&acc1); \
99-
__builtin_mma_xxsetaccz (&acc2); \
100-
__builtin_mma_xxsetaccz (&acc3); \
101-
__builtin_mma_xxsetaccz (&acc4); \
102-
__builtin_mma_xxsetaccz (&acc5); \
103-
__builtin_mma_xxsetaccz (&acc6); \
104-
__builtin_mma_xxsetaccz (&acc7);
105-
10690
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
10791

10892
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
210194
PREFETCH1 (CO + ldc + ldc, 128);
211195
PREFETCH1 (CO + ldc + ldc + ldc, 128);
212196
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
213-
SET_ACC_ZERO8 ();
214-
for (l = 0; l < temp; l++)
197+
vec_t *rowA = (vec_t *) & AO[0];
198+
__vector_pair rowB;
199+
vec_t *rb = (vec_t *) & BO[0];
200+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
201+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
202+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
203+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
204+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
205+
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
206+
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
207+
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
208+
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
209+
for (l = 1; l < temp; l++)
215210
{
216-
vec_t *rowA = (vec_t *) & AO[l << 4];
217-
__vector_pair rowB;
218-
vec_t *rb = (vec_t *) & BO[l << 2];
211+
rowA = (vec_t *) & AO[l << 4];
212+
rb = (vec_t *) & BO[l << 2];
219213
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
220214
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
221215
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
254248
v4sf_t *rowC;
255249
v4sf_t result[4];
256250
__vector_quad acc0, acc1, acc2, acc3;
257-
SET_ACC_ZERO4 ();
258251
BLASLONG l = 0;
259-
for (l = 0; l < temp; l++)
252+
vec_t *rowA = (vec_t *) & AO[0];
253+
__vector_pair rowB;
254+
vec_t *rb = (vec_t *) & BO[0];
255+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
256+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
257+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
258+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
259+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
260+
for (l = 1; l < temp; l++)
260261
{
261-
vec_t *rowA = (vec_t *) & AO[l << 3];
262-
__vector_pair rowB;
263-
vec_t *rb = (vec_t *) & BO[l << 2];
262+
rowA = (vec_t *) & AO[l << 3];
263+
rb = (vec_t *) & BO[l << 2];
264264
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
265265
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
266266
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
291291
v4sf_t *rowC;
292292
v4sf_t result[4];
293293
__vector_quad acc0, acc1;
294-
__builtin_mma_xxsetaccz (&acc0);
295-
__builtin_mma_xxsetaccz (&acc1);
296294
BLASLONG l = 0;
297-
for (l = 0; l < temp; l++)
295+
vec_t *rowA = (vec_t *) & AO[0];
296+
__vector_pair rowB;
297+
vec_t *rb = (vec_t *) & BO[0];
298+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
299+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
300+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
301+
for (l = 1; l < temp; l++)
298302
{
299-
vec_t *rowA = (vec_t *) & AO[l << 2];
300-
__vector_pair rowB;
301-
vec_t *rb = (vec_t *) & BO[l << 2];
303+
rowA = (vec_t *) & AO[l << 2];
304+
rb = (vec_t *) & BO[l << 2];
302305
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
303306
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
304307
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
325328
v4sf_t *rowC;
326329
v4sf_t result[4];
327330
__vector_quad acc0;
328-
__builtin_mma_xxsetaccz (&acc0);
329331
BLASLONG l = 0;
330-
for (l = 0; l < temp; l++)
332+
vec_t *rowA = (vec_t *) & AO[0];
333+
__vector_pair rowB;
334+
vec_t *rb = (vec_t *) & BO[0];
335+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
336+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
337+
for (l = 1; l < temp; l++)
331338
{
332-
vec_t *rowA = (vec_t *) & AO[l << 1];
333-
__vector_pair rowB;
334-
vec_t *rb = (vec_t *) & BO[l << 2];
339+
rowA = (vec_t *) & AO[l << 1];
340+
rb = (vec_t *) & BO[l << 2];
335341
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
336342
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
337343
}
@@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
414420
v4sf_t *rowC;
415421
v4sf_t result[4];
416422
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
417-
SET_ACC_ZERO8 ();
418423
BLASLONG l = 0;
419-
for (l = 0; l < temp; l++)
424+
FLOAT t[4] = { 0, 0, 0, 0 };
425+
t[0] = BO[0], t[1] = BO[1];
426+
__vector_pair rowB;
427+
vec_t *rb = (vec_t *) & t[0];
428+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
429+
vec_t *rowA = (vec_t *) & AO[0];
430+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
431+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
432+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
433+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
434+
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
435+
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
436+
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
437+
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
438+
for (l = 1; l < temp; l++)
420439
{
421-
FLOAT t[4] = { 0, 0, 0, 0 };
422440
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
423-
__vector_pair rowB;
424-
vec_t *rb = (vec_t *) & t[0];
441+
rb = (vec_t *) & t[0];
425442
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
426-
vec_t *rowA = (vec_t *) & AO[l << 4];
443+
rowA = (vec_t *) & AO[l << 4];
427444
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
428445
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
429446
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
461478
v4sf_t *rowC;
462479
v4sf_t result[4];
463480
__vector_quad acc0, acc1, acc2, acc3;
464-
SET_ACC_ZERO4 ();
465481
BLASLONG l = 0;
466-
for (l = 0; l < temp; l++)
482+
FLOAT t[4] = { 0, 0, 0, 0 };
483+
t[0] = BO[0], t[1] = BO[1];
484+
__vector_pair rowB;
485+
vec_t *rb = (vec_t *) & t[0];
486+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
487+
vec_t *rowA = (vec_t *) & AO[0];
488+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
489+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
490+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
491+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
492+
for (l = 1; l < temp; l++)
467493
{
468-
FLOAT t[4] = { 0, 0, 0, 0 };
469494
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
470-
__vector_pair rowB;
471-
vec_t *rb = (vec_t *) & t[0];
495+
rb = (vec_t *) & t[0];
472496
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
473-
vec_t *rowA = (vec_t *) & AO[l << 3];
497+
rowA = (vec_t *) & AO[l << 3];
474498
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
475499
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
476500
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
500524
v4sf_t *rowC;
501525
v4sf_t result[4];
502526
__vector_quad acc0, acc1;
503-
__builtin_mma_xxsetaccz (&acc0);
504-
__builtin_mma_xxsetaccz (&acc1);
505527
BLASLONG l = 0;
506-
for (l = 0; l < temp; l++)
528+
FLOAT t[4] = { 0, 0, 0, 0 };
529+
t[0] = BO[0], t[1] = BO[1];
530+
__vector_pair rowB;
531+
vec_t *rb = (vec_t *) & t[0];
532+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
533+
vec_t *rowA = (vec_t *) & AO[0];
534+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
535+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
536+
for (l = 1; l < temp; l++)
507537
{
508-
FLOAT t[4] = { 0, 0, 0, 0 };
509538
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
510-
__vector_pair rowB;
511-
vec_t *rb = (vec_t *) & t[0];
539+
rb = (vec_t *) & t[0];
512540
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
513-
vec_t *rowA = (vec_t *) & AO[l << 2];
541+
rowA = (vec_t *) & AO[l << 2];
514542
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
515543
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
516544
}
@@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
536564
v4sf_t *rowC;
537565
v4sf_t result[4];
538566
__vector_quad acc0;
539-
__builtin_mma_xxsetaccz (&acc0);
540567
BLASLONG l = 0;
541-
for (l = 0; l < temp; l++)
568+
FLOAT t[4] = { 0, 0, 0, 0 };
569+
t[0] = BO[0], t[1] = BO[1];
570+
__vector_pair rowB;
571+
vec_t *rb = (vec_t *) & t[0];
572+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
573+
vec_t *rowA = (vec_t *) & AO[0];
574+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
575+
for (l = 1; l < temp; l++)
542576
{
543-
FLOAT t[4] = { 0, 0, 0, 0 };
544577
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
545-
__vector_pair rowB;
546-
vec_t *rb = (vec_t *) & t[0];
578+
rb = (vec_t *) & t[0];
547579
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
548-
vec_t *rowA = (vec_t *) & AO[l << 1];
580+
rowA = (vec_t *) & AO[l << 1];
549581
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
550582
}
551583
SAVE2x4_ACC (&acc0, 0);

0 commit comments

Comments
 (0)