@@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
87
87
rowC[0] += result[1] * alpha;
88
88
#endif
89
89
90
- #define SET_ACC_ZERO4 () \
91
- __builtin_mma_xxsetaccz (&acc0); \
92
- __builtin_mma_xxsetaccz (&acc1); \
93
- __builtin_mma_xxsetaccz (&acc2); \
94
- __builtin_mma_xxsetaccz (&acc3);
95
-
96
- #define SET_ACC_ZERO8 () \
97
- __builtin_mma_xxsetaccz (&acc0); \
98
- __builtin_mma_xxsetaccz (&acc1); \
99
- __builtin_mma_xxsetaccz (&acc2); \
100
- __builtin_mma_xxsetaccz (&acc3); \
101
- __builtin_mma_xxsetaccz (&acc4); \
102
- __builtin_mma_xxsetaccz (&acc5); \
103
- __builtin_mma_xxsetaccz (&acc6); \
104
- __builtin_mma_xxsetaccz (&acc7);
105
-
106
90
/*
 * PREFETCH1(x, y): hint the cache to fetch the block at address x + y.
 *
 * Emits the Power ISA "dcbt" (data cache block touch) instruction.  dcbt
 * forms its effective address as (RA|0) + (RB): encoding r0 as the first
 * operand means the literal value 0, not the contents of r0.  The base
 * pointer x therefore takes the "b" constraint (any GPR except r0), while
 * the offset y may live in any GPR ("r").  Using "r" for x would allow the
 * compiler to pick r0 for it, and the hint would then target address y alone.
 *
 * The "memory" clobber stops the compiler from moving memory accesses across
 * the hint.  __asm__/__volatile__ are spelled with underscores so the macro
 * also expands under strict ISO modes (-std=c11), where plain `asm` is not a
 * keyword.
 *
 * The expansion ends in ';' (kept for existing call sites), so writing
 * `PREFETCH1 (p, 128);` yields one extra, harmless empty statement.
 */
#define PREFETCH1(x, y) \
  __asm__ __volatile__ ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
107
91
108
92
#if (defined(LEFT ) && !defined(TRANSA )) || (!defined(LEFT ) && defined(TRANSA ))
@@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
210
194
PREFETCH1 (CO + ldc + ldc , 128 );
211
195
PREFETCH1 (CO + ldc + ldc + ldc , 128 );
212
196
__vector_quad acc0 , acc1 , acc2 , acc3 , acc4 , acc5 , acc6 , acc7 ;
213
- SET_ACC_ZERO8 ();
214
- for (l = 0 ; l < temp ; l ++ )
197
+ vec_t * rowA = (vec_t * ) & AO [0 ];
198
+ __vector_pair rowB ;
199
+ vec_t * rb = (vec_t * ) & BO [0 ];
200
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
201
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
202
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
203
+ __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
204
+ __builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
205
+ __builtin_mma_xvf64ger (& acc4 , rowB , rowA [4 ]);
206
+ __builtin_mma_xvf64ger (& acc5 , rowB , rowA [5 ]);
207
+ __builtin_mma_xvf64ger (& acc6 , rowB , rowA [6 ]);
208
+ __builtin_mma_xvf64ger (& acc7 , rowB , rowA [7 ]);
209
+ for (l = 1 ; l < temp ; l ++ )
215
210
{
216
- vec_t * rowA = (vec_t * ) & AO [l << 4 ];
217
- __vector_pair rowB ;
218
- vec_t * rb = (vec_t * ) & BO [l << 2 ];
211
+ rowA = (vec_t * ) & AO [l << 4 ];
212
+ rb = (vec_t * ) & BO [l << 2 ];
219
213
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
220
214
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
221
215
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
254
248
v4sf_t * rowC ;
255
249
v4sf_t result [4 ];
256
250
__vector_quad acc0 , acc1 , acc2 , acc3 ;
257
- SET_ACC_ZERO4 ();
258
251
BLASLONG l = 0 ;
259
- for (l = 0 ; l < temp ; l ++ )
252
+ vec_t * rowA = (vec_t * ) & AO [0 ];
253
+ __vector_pair rowB ;
254
+ vec_t * rb = (vec_t * ) & BO [0 ];
255
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
256
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
257
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
258
+ __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
259
+ __builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
260
+ for (l = 1 ; l < temp ; l ++ )
260
261
{
261
- vec_t * rowA = (vec_t * ) & AO [l << 3 ];
262
- __vector_pair rowB ;
263
- vec_t * rb = (vec_t * ) & BO [l << 2 ];
262
+ rowA = (vec_t * ) & AO [l << 3 ];
263
+ rb = (vec_t * ) & BO [l << 2 ];
264
264
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
265
265
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
266
266
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
291
291
v4sf_t * rowC ;
292
292
v4sf_t result [4 ];
293
293
__vector_quad acc0 , acc1 ;
294
- __builtin_mma_xxsetaccz (& acc0 );
295
- __builtin_mma_xxsetaccz (& acc1 );
296
294
BLASLONG l = 0 ;
297
- for (l = 0 ; l < temp ; l ++ )
295
+ vec_t * rowA = (vec_t * ) & AO [0 ];
296
+ __vector_pair rowB ;
297
+ vec_t * rb = (vec_t * ) & BO [0 ];
298
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
299
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
300
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
301
+ for (l = 1 ; l < temp ; l ++ )
298
302
{
299
- vec_t * rowA = (vec_t * ) & AO [l << 2 ];
300
- __vector_pair rowB ;
301
- vec_t * rb = (vec_t * ) & BO [l << 2 ];
303
+ rowA = (vec_t * ) & AO [l << 2 ];
304
+ rb = (vec_t * ) & BO [l << 2 ];
302
305
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
303
306
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
304
307
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
325
328
v4sf_t * rowC ;
326
329
v4sf_t result [4 ];
327
330
__vector_quad acc0 ;
328
- __builtin_mma_xxsetaccz (& acc0 );
329
331
BLASLONG l = 0 ;
330
- for (l = 0 ; l < temp ; l ++ )
332
+ vec_t * rowA = (vec_t * ) & AO [0 ];
333
+ __vector_pair rowB ;
334
+ vec_t * rb = (vec_t * ) & BO [0 ];
335
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
336
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
337
+ for (l = 1 ; l < temp ; l ++ )
331
338
{
332
- vec_t * rowA = (vec_t * ) & AO [l << 1 ];
333
- __vector_pair rowB ;
334
- vec_t * rb = (vec_t * ) & BO [l << 2 ];
339
+ rowA = (vec_t * ) & AO [l << 1 ];
340
+ rb = (vec_t * ) & BO [l << 2 ];
335
341
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
336
342
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
337
343
}
@@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
414
420
v4sf_t * rowC ;
415
421
v4sf_t result [4 ];
416
422
__vector_quad acc0 , acc1 , acc2 , acc3 , acc4 , acc5 , acc6 , acc7 ;
417
- SET_ACC_ZERO8 ();
418
423
BLASLONG l = 0 ;
419
- for (l = 0 ; l < temp ; l ++ )
424
+ FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
425
+ t [0 ] = BO [0 ], t [1 ] = BO [1 ];
426
+ __vector_pair rowB ;
427
+ vec_t * rb = (vec_t * ) & t [0 ];
428
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
429
+ vec_t * rowA = (vec_t * ) & AO [0 ];
430
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
431
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
432
+ __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
433
+ __builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
434
+ __builtin_mma_xvf64ger (& acc4 , rowB , rowA [4 ]);
435
+ __builtin_mma_xvf64ger (& acc5 , rowB , rowA [5 ]);
436
+ __builtin_mma_xvf64ger (& acc6 , rowB , rowA [6 ]);
437
+ __builtin_mma_xvf64ger (& acc7 , rowB , rowA [7 ]);
438
+ for (l = 1 ; l < temp ; l ++ )
420
439
{
421
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
422
440
t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
423
- __vector_pair rowB ;
424
- vec_t * rb = (vec_t * ) & t [0 ];
441
+ rb = (vec_t * ) & t [0 ];
425
442
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
426
- vec_t * rowA = (vec_t * ) & AO [l << 4 ];
443
+ rowA = (vec_t * ) & AO [l << 4 ];
427
444
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
428
445
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
429
446
__builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [2 ]);
@@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
461
478
v4sf_t * rowC ;
462
479
v4sf_t result [4 ];
463
480
__vector_quad acc0 , acc1 , acc2 , acc3 ;
464
- SET_ACC_ZERO4 ();
465
481
BLASLONG l = 0 ;
466
- for (l = 0 ; l < temp ; l ++ )
482
+ FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
483
+ t [0 ] = BO [0 ], t [1 ] = BO [1 ];
484
+ __vector_pair rowB ;
485
+ vec_t * rb = (vec_t * ) & t [0 ];
486
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
487
+ vec_t * rowA = (vec_t * ) & AO [0 ];
488
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
489
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
490
+ __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
491
+ __builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
492
+ for (l = 1 ; l < temp ; l ++ )
467
493
{
468
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
469
494
t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
470
- __vector_pair rowB ;
471
- vec_t * rb = (vec_t * ) & t [0 ];
495
+ rb = (vec_t * ) & t [0 ];
472
496
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
473
- vec_t * rowA = (vec_t * ) & AO [l << 3 ];
497
+ rowA = (vec_t * ) & AO [l << 3 ];
474
498
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
475
499
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
476
500
__builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [2 ]);
@@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
500
524
v4sf_t * rowC ;
501
525
v4sf_t result [4 ];
502
526
__vector_quad acc0 , acc1 ;
503
- __builtin_mma_xxsetaccz (& acc0 );
504
- __builtin_mma_xxsetaccz (& acc1 );
505
527
BLASLONG l = 0 ;
506
- for (l = 0 ; l < temp ; l ++ )
528
+ FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
529
+ t [0 ] = BO [0 ], t [1 ] = BO [1 ];
530
+ __vector_pair rowB ;
531
+ vec_t * rb = (vec_t * ) & t [0 ];
532
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
533
+ vec_t * rowA = (vec_t * ) & AO [0 ];
534
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
535
+ __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
536
+ for (l = 1 ; l < temp ; l ++ )
507
537
{
508
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
509
538
t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
510
- __vector_pair rowB ;
511
- vec_t * rb = (vec_t * ) & t [0 ];
539
+ rb = (vec_t * ) & t [0 ];
512
540
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
513
- vec_t * rowA = (vec_t * ) & AO [l << 2 ];
541
+ rowA = (vec_t * ) & AO [l << 2 ];
514
542
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
515
543
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
516
544
}
@@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
536
564
v4sf_t * rowC ;
537
565
v4sf_t result [4 ];
538
566
__vector_quad acc0 ;
539
- __builtin_mma_xxsetaccz (& acc0 );
540
567
BLASLONG l = 0 ;
541
- for (l = 0 ; l < temp ; l ++ )
568
+ FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
569
+ t [0 ] = BO [0 ], t [1 ] = BO [1 ];
570
+ __vector_pair rowB ;
571
+ vec_t * rb = (vec_t * ) & t [0 ];
572
+ __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
573
+ vec_t * rowA = (vec_t * ) & AO [0 ];
574
+ __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
575
+ for (l = 1 ; l < temp ; l ++ )
542
576
{
543
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
544
577
t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
545
- __vector_pair rowB ;
546
- vec_t * rb = (vec_t * ) & t [0 ];
578
+ rb = (vec_t * ) & t [0 ];
547
579
__builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
548
- vec_t * rowA = (vec_t * ) & AO [l << 1 ];
580
+ rowA = (vec_t * ) & AO [l << 1 ];
549
581
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
550
582
}
551
583
SAVE2x4_ACC (& acc0 , 0 );
0 commit comments