@@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
29
30
30
typedef __vector unsigned char vec_t ;
31
31
typedef FLOAT v4sf_t __attribute__ ((vector_size (16 )));
32
- typedef FLOAT v2sf_t __attribute__ ((vector_size (8 )));
32
+ #if !__has_builtin (__builtin_vsx_assemble_pair ) /* compiler lacks the vsx-named pair-assemble builtin */
33
+ #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair /* alias the vsx name to the mma-named builtin so calls written with the vsx name resolve */
34
+ #endif /* !__has_builtin (__builtin_vsx_assemble_pair) */
35
+
36
+ #if !__has_builtin (__builtin_vsx_disassemble_pair ) /* same fallback for the pair-disassemble builtin */
37
+ #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair /* alias the vsx name to the mma-named builtin */
38
+ #endif /* !__has_builtin (__builtin_vsx_disassemble_pair) */
33
39
34
40
#ifdef TRMMKERNEL
35
41
#define SAVE_ACC (ACC , J ) \
@@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
186
192
vec_t * rowA = (vec_t * ) & AO [0 ];
187
193
vec_t * rb = (vec_t * ) & BO [0 ];
188
194
__vector_pair rowB , rowB1 ;
189
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
190
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
195
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
196
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
191
197
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
192
198
__builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
193
199
__builtin_mma_xvf64ger (& acc2 , rowB , rowA [1 ]);
@@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
200
206
{
201
207
rowA = (vec_t * ) & AO [l << 3 ];
202
208
rb = (vec_t * ) & BO [l << 3 ];
203
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
204
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
209
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
210
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
205
211
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
206
212
__builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
207
213
__builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [1 ]);
@@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
242
248
vec_t * rowA = (vec_t * ) & AO [0 ];
243
249
__vector_pair rowB , rowB1 ;
244
250
vec_t * rb = (vec_t * ) & BO [0 ];
245
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
246
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
251
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
252
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
247
253
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
248
254
__builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
249
255
__builtin_mma_xvf64ger (& acc2 , rowB , rowA [1 ]);
@@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
252
258
{
253
259
rowA = (vec_t * ) & AO [l << 2 ];
254
260
rb = (vec_t * ) & BO [l << 3 ];
255
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
256
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
261
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
262
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
257
263
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
258
264
__builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
259
265
__builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [1 ]);
@@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
286
292
vec_t * rowA = (vec_t * ) & AO [0 ];
287
293
__vector_pair rowB , rowB1 ;
288
294
vec_t * rb = (vec_t * ) & BO [0 ];
289
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
290
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
295
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
296
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
291
297
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
292
298
__builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
293
299
for (l = 1 ; l < temp ; l ++ )
294
300
{
295
301
rowA = (vec_t * ) & AO [l << 1 ];
296
302
rb = (vec_t * ) & BO [l << 3 ];
297
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
298
- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
303
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
304
+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
299
305
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
300
306
__builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
301
307
}
@@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
398
404
vec_t * rowA = (vec_t * ) & AO [0 ];
399
405
__vector_pair rowB ;
400
406
vec_t * rb = (vec_t * ) & BO [0 ];
401
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
407
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
402
408
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
403
409
__builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
404
410
__builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
@@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
407
413
{
408
414
rowA = (vec_t * ) & AO [l << 3 ];
409
415
rb = (vec_t * ) & BO [l << 2 ];
410
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
416
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
411
417
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
412
418
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
413
419
__builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [2 ]);
@@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
440
446
vec_t * rowA = (vec_t * ) & AO [0 ];
441
447
__vector_pair rowB ;
442
448
vec_t * rb = (vec_t * ) & BO [0 ];
443
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
449
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
444
450
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
445
451
__builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
446
452
for (l = 1 ; l < temp ; l ++ )
447
453
{
448
454
rowA = (vec_t * ) & AO [l << 2 ];
449
455
rb = (vec_t * ) & BO [l << 2 ];
450
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
456
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
451
457
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
452
458
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
453
459
}
@@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
476
482
vec_t * rowA = (vec_t * ) & AO [0 ];
477
483
__vector_pair rowB ;
478
484
vec_t * rb = (vec_t * ) & BO [0 ];
479
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
485
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
480
486
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
481
487
for (l = 1 ; l < temp ; l ++ )
482
488
{
483
489
rowA = (vec_t * ) & AO [l << 1 ];
484
490
rb = (vec_t * ) & BO [l << 2 ];
485
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
491
+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
486
492
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
487
493
}
488
494
SAVE_ACC (& acc0 , 0 );
@@ -562,21 +568,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
562
568
v4sf_t result [4 ];
563
569
__vector_quad acc0 , acc1 , acc2 , acc3 ;
564
570
BLASLONG l = 0 ;
565
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
566
- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
567
571
__vector_pair rowB ;
568
- vec_t * rb = (vec_t * ) & t [0 ];
569
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
572
+ vec_t * rb = (vec_t * ) & BO [0 ];
573
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
570
574
vec_t * rowA = (vec_t * ) & AO [0 ];
571
575
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
572
576
__builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
573
577
__builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
574
578
__builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
575
579
for (l = 1 ; l < temp ; l ++ )
576
580
{
577
- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
578
- rb = (vec_t * ) & t [0 ];
579
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
581
+ rb = (vec_t * ) & BO [l << 1 ];
582
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
580
583
rowA = (vec_t * ) & AO [l << 3 ];
581
584
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
582
585
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
607
610
v4sf_t result [4 ];
608
611
__vector_quad acc0 , acc1 ;
609
612
BLASLONG l = 0 ;
610
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
611
- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
612
613
__vector_pair rowB ;
613
- vec_t * rb = (vec_t * ) & t [0 ];
614
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
614
+ vec_t * rb = (vec_t * ) & BO [0 ];
615
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
615
616
vec_t * rowA = (vec_t * ) & AO [0 ];
616
617
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
617
618
__builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
618
619
for (l = 1 ; l < temp ; l ++ )
619
620
{
620
- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
621
- rb = (vec_t * ) & t [0 ];
622
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
621
+ rb = (vec_t * ) & BO [l << 1 ];
622
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
623
623
rowA = (vec_t * ) & AO [l << 2 ];
624
624
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
625
625
__builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
646
646
v4sf_t result [4 ];
647
647
__vector_quad acc0 ;
648
648
BLASLONG l = 0 ;
649
- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
650
- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
651
649
__vector_pair rowB ;
652
- vec_t * rb = (vec_t * ) & t [0 ];
653
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
650
+ vec_t * rb = (vec_t * ) & BO [0 ];
651
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
654
652
vec_t * rowA = (vec_t * ) & AO [0 ];
655
653
__builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
656
654
for (l = 1 ; l < temp ; l ++ )
657
655
{
658
- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
659
- rb = (vec_t * ) & t [0 ];
660
- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
656
+ rb = (vec_t * ) & BO [l << 1 ];
657
+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
661
658
rowA = (vec_t * ) & AO [l << 1 ];
662
659
__builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
663
660
}
0 commit comments