@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
26
*****************************************************************************/
27
27
28
28
/**************************************************************************************
29
- * 2016/04/03 Werner Saar ([email protected] )
29
+ * 2016/04/04 Werner Saar ([email protected] )
30
30
* BLASTEST : OK
31
31
* CTEST : OK
32
32
* TEST : OK
@@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
38
39
39
CGEMM_L4_BEGIN:
40
40
41
+ mr BO, B
42
+ mr BBO, BBUFFER
43
+ slwi T1, K, 3
44
+
45
+ /*
+  * CGEMM_L4_COPYB: expand the B panel at BO into the work buffer at BBO.
+  * Each pass loads 8 32-bit words (two lxvw4x), splats every word across
+  * a full 16-byte vector (xxspltw) and stores the 8 resulting vectors,
+  * so 32 source bytes become 128 destination bytes.
+  * T1 = source words still to expand (K*8 on entry, from the slwi that
+  * precedes this label).
+  * NOTE(review): o0/o16/o32/o48 are presumably index registers holding
+  * those byte offsets and PRE the prefetch distance - both are defined
+  * outside this view, confirm there.
+  */
+ CGEMM_L4_COPYB:
46
+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
47
+
48
+ lxvw4x vs3, o0, BO /* source words 0..3 */
49
+ lxvw4x vs11, o16, BO /* source words 4..7 */
50
+ xxspltw vs4, vs3, 0 /* one vector per source word, word replicated 4x */
51
+ xxspltw vs5, vs3, 1
52
+ xxspltw vs6, vs3, 2
53
+ xxspltw vs7, vs3, 3
54
+ xxspltw vs12, vs11, 0
55
+ xxspltw vs13, vs11, 1
56
+ xxspltw vs14, vs11, 2
57
+ xxspltw vs15, vs11, 3
58
+ stxvw4x vs4, o0, BBO /* first 64 destination bytes: words 0..3 expanded */
59
+ stxvw4x vs5, o16, BBO
60
+ stxvw4x vs6, o32, BBO
61
+ stxvw4x vs7, o48, BBO
62
+ addi BO, BO, 32 /* consumed 8 source words */
63
+ addi BBO, BBO, 64
64
+ stxvw4x vs12, o0, BBO /* second 64 destination bytes: words 4..7 expanded */
65
+ stxvw4x vs13, o16, BBO
66
+ stxvw4x vs14, o32, BBO
67
+ stxvw4x vs15, o48, BBO
68
+ addic. T1, T1, -8 /* 8 words done; record form sets CR0 for the branch below */
69
+ addi BBO, BBO, 64 /* addi leaves CR0 untouched */
70
+
71
+ bgt CGEMM_L4_COPYB /* repeat only while words remain: T1 is a multiple of 8 here, so bge ran one pass too many (32 bytes over-read, 128 bytes over-written) */
72
+
73
+
41
74
mr CO, C
42
75
mr AO, A
43
76
slwi T1, LDC , 2
@@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
48
81
CGEMM_L4x8_BEGIN:
49
82
50
83
51
- mr BO, B
84
+ mr BO, BBUFFER
52
85
srawi. L, K, 3
53
86
ble CGEMM_L4x8_SUB0
54
87
cmpwi cr0, L, 1
@@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
59
92
dcbt AO, PRE
60
93
dcbt BO, PRE
61
94
LOAD4x8_1
95
+ dcbt BO, PRE
62
96
KERNEL4x8_I1
97
+ dcbt BO, PRE
63
98
dcbt AO, PRE
64
99
KERNEL4x8_2
100
+ dcbt BO, PRE
65
101
KERNEL4x8_1
102
+ dcbt BO, PRE
66
103
dcbt AO, PRE
67
104
KERNEL4x8_2
68
105
106
+ dcbt BO, PRE
69
107
KERNEL4x8_1
70
- dcbt AO, PRE
71
108
dcbt BO, PRE
109
+ dcbt AO, PRE
72
110
KERNEL4x8_2
111
+ dcbt BO, PRE
73
112
KERNEL4x8_1
113
+ dcbt BO, PRE
74
114
dcbt AO, PRE
75
115
KERNEL4x8_2
76
116
@@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
81
121
82
122
CGEMM_L4x8_LOOP:
83
123
124
+ dcbt BO, PRE
84
125
KERNEL4x8_1
126
+ dcbt BO, PRE
85
127
dcbt AO, PRE
86
128
KERNEL4x8_2
129
+ dcbt BO, PRE
87
130
KERNEL4x8_1
131
+ dcbt BO, PRE
88
132
dcbt AO, PRE
89
133
KERNEL4x8_2
90
134
135
+ dcbt BO, PRE
91
136
KERNEL4x8_1
92
- dcbt AO, PRE
93
137
dcbt BO, PRE
138
+ dcbt AO, PRE
94
139
KERNEL4x8_2
140
+ dcbt BO, PRE
95
141
KERNEL4x8_1
142
+ dcbt BO, PRE
96
143
dcbt AO, PRE
97
144
KERNEL4x8_2
98
145
@@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
101
148
102
149
CGEMM_L4x8_LOOP_END:
103
150
151
+ dcbt BO, PRE
104
152
KERNEL4x8_1
153
+ dcbt BO, PRE
105
154
dcbt AO, PRE
106
155
KERNEL4x8_2
107
156
KERNEL4x8_1
@@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
168
217
169
218
andi. T1, M, 4
170
219
ble CGEMM_L4x4_END
171
- mr BO, B
220
+ mr BO, BBUFFER
172
221
srawi. L, K, 3
173
222
ble CGEMM_L4x4_SUB0
174
223
cmpwi cr0, L, 1
@@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
268
317
269
318
andi. T1, M, 2
270
319
ble CGEMM_L4x2_END
271
- mr BO, B
320
+ mr BO, BBUFFER
272
321
srawi. L, K, 3
273
322
ble CGEMM_L4x2_SUB0
274
323
cmpwi cr0, L, 1
@@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
368
417
369
418
andi. T1, M, 1
370
419
ble CGEMM_L4x1_END
371
- mr BO, B
420
+ mr BO, BBUFFER
372
421
srawi. L, K, 3
373
422
ble CGEMM_L4x1_SUB0
374
423
cmpwi cr0, L, 1
@@ -482,6 +531,39 @@ L999_H1:
482
531
483
532
CGEMM_L2_BEGIN:
484
533
534
+ mr BO, B
535
+ mr BBO, BBUFFER
536
+ slwi T1, K, 2
537
+
538
+ /*
+  * CGEMM_L2_COPYB: expand the B panel at BO into the work buffer at BBO.
+  * Each pass loads 8 32-bit words (two lxvw4x), splats every word across
+  * a full 16-byte vector (xxspltw) and stores the 8 resulting vectors,
+  * so 32 source bytes become 128 destination bytes.
+  * T1 = source words still to expand (K*4 on entry, from the slwi that
+  * precedes this label). A pass always handles a full group of 8 words,
+  * so when T1 is not a multiple of 8 the last pass still processes 8.
+  * NOTE(review): o0/o16/o32/o48 are presumably index registers holding
+  * those byte offsets and PRE the prefetch distance - both are defined
+  * outside this view, confirm there.
+  * NOTE(review): this copy runs before the "andi. T1, N, 2" test that
+  * follows the loop, so it also executes when that test skips the
+  * section - confirm B is always readable at that point.
+  */
+ CGEMM_L2_COPYB:
539
+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
540
+
541
+ lxvw4x vs3, o0, BO /* source words 0..3 */
542
+ lxvw4x vs11, o16, BO /* source words 4..7 */
543
+ xxspltw vs4, vs3, 0 /* one vector per source word, word replicated 4x */
544
+ xxspltw vs5, vs3, 1
545
+ xxspltw vs6, vs3, 2
546
+ xxspltw vs7, vs3, 3
547
+ xxspltw vs12, vs11, 0
548
+ xxspltw vs13, vs11, 1
549
+ xxspltw vs14, vs11, 2
550
+ xxspltw vs15, vs11, 3
551
+ stxvw4x vs4, o0, BBO /* first 64 destination bytes: words 0..3 expanded */
552
+ stxvw4x vs5, o16, BBO
553
+ stxvw4x vs6, o32, BBO
554
+ stxvw4x vs7, o48, BBO
555
+ addi BO, BO, 32 /* consumed 8 source words */
556
+ addi BBO, BBO, 64
557
+ stxvw4x vs12, o0, BBO /* second 64 destination bytes: words 4..7 expanded */
558
+ stxvw4x vs13, o16, BBO
559
+ stxvw4x vs14, o32, BBO
560
+ stxvw4x vs15, o48, BBO
561
+ addic. T1, T1, -8 /* 8 words done; record form sets CR0 for the branch below */
562
+ addi BBO, BBO, 64 /* addi leaves CR0 untouched */
563
+
564
+ bgt CGEMM_L2_COPYB /* repeat only while words remain: bge ran one pass too many whenever T1 reached exactly 0 (32 bytes over-read, 128 bytes over-written) */
565
+
566
+
485
567
andi. T1, N, 2
486
568
ble CGEMM_L2_END
487
569
mr CO, C
@@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
494
576
CGEMM_L2x8_BEGIN:
495
577
496
578
497
- mr BO, B
579
+ mr BO, BBUFFER
498
580
srawi. L, K, 3
499
581
ble CGEMM_L2x8_SUB0
500
582
cmpwi cr0, L, 1
@@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
611
693
612
694
andi. T1, M, 4
613
695
ble CGEMM_L2x4_END
614
- mr BO, B
696
+ mr BO, BBUFFER
615
697
srawi. L, K, 3
616
698
ble CGEMM_L2x4_SUB0
617
699
cmpwi cr0, L, 1
@@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
711
793
712
794
andi. T1, M, 2
713
795
ble CGEMM_L2x2_END
714
- mr BO, B
796
+ mr BO, BBUFFER
715
797
srawi. L, K, 3
716
798
ble CGEMM_L2x2_SUB0
717
799
cmpwi cr0, L, 1
@@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
811
893
812
894
andi. T1, M, 1
813
895
ble CGEMM_L2x1_END
814
- mr BO, B
896
+ mr BO, BBUFFER
815
897
srawi. L, K, 3
816
898
ble CGEMM_L2x1_SUB0
817
899
cmpwi cr0, L, 1
@@ -919,6 +1001,39 @@ L999_H2:
919
1001
920
1002
CGEMM_L1_BEGIN:
921
1003
1004
+ mr BO, B
1005
+ mr BBO, BBUFFER
1006
+ slwi T1, K, 1
1007
+
1008
+ /*
+  * CGEMM_L1_COPYB: expand the B panel at BO into the work buffer at BBO.
+  * Each pass loads 8 32-bit words (two lxvw4x), splats every word across
+  * a full 16-byte vector (xxspltw) and stores the 8 resulting vectors,
+  * so 32 source bytes become 128 destination bytes.
+  * T1 = source words still to expand (K*2 on entry, from the slwi that
+  * precedes this label). A pass always handles a full group of 8 words,
+  * so when T1 is not a multiple of 8 the last pass still processes 8.
+  * NOTE(review): o0/o16/o32/o48 are presumably index registers holding
+  * those byte offsets and PRE the prefetch distance - both are defined
+  * outside this view, confirm there.
+  * NOTE(review): this copy runs before the "andi. T1, N, 1" test that
+  * follows the loop, so it also executes when that test skips the
+  * section - confirm B is always readable at that point.
+  */
+ CGEMM_L1_COPYB:
1009
+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
1010
+
1011
+ lxvw4x vs3, o0, BO /* source words 0..3 */
1012
+ lxvw4x vs11, o16, BO /* source words 4..7 */
1013
+ xxspltw vs4, vs3, 0 /* one vector per source word, word replicated 4x */
1014
+ xxspltw vs5, vs3, 1
1015
+ xxspltw vs6, vs3, 2
1016
+ xxspltw vs7, vs3, 3
1017
+ xxspltw vs12, vs11, 0
1018
+ xxspltw vs13, vs11, 1
1019
+ xxspltw vs14, vs11, 2
1020
+ xxspltw vs15, vs11, 3
1021
+ stxvw4x vs4, o0, BBO /* first 64 destination bytes: words 0..3 expanded */
1022
+ stxvw4x vs5, o16, BBO
1023
+ stxvw4x vs6, o32, BBO
1024
+ stxvw4x vs7, o48, BBO
1025
+ addi BO, BO, 32 /* consumed 8 source words */
1026
+ addi BBO, BBO, 64
1027
+ stxvw4x vs12, o0, BBO /* second 64 destination bytes: words 4..7 expanded */
1028
+ stxvw4x vs13, o16, BBO
1029
+ stxvw4x vs14, o32, BBO
1030
+ stxvw4x vs15, o48, BBO
1031
+ addic. T1, T1, -8 /* 8 words done; record form sets CR0 for the branch below */
1032
+ addi BBO, BBO, 64 /* addi leaves CR0 untouched */
1033
+
1034
+ bgt CGEMM_L1_COPYB /* repeat only while words remain: bge ran one pass too many whenever T1 reached exactly 0 (32 bytes over-read, 128 bytes over-written) */
1035
+
1036
+
922
1037
andi. T1, N, 1
923
1038
ble CGEMM_L1_END
924
1039
mr CO, C
@@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
929
1044
CGEMM_L1x8_BEGIN:
930
1045
931
1046
932
- mr BO, B
1047
+ mr BO, BBUFFER
933
1048
srawi. L, K, 3
934
1049
ble CGEMM_L1x8_SUB0
935
1050
cmpwi cr0, L, 1
@@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
1046
1161
1047
1162
andi. T1, M, 4
1048
1163
ble CGEMM_L1x4_END
1049
- mr BO, B
1164
+ mr BO, BBUFFER
1050
1165
srawi. L, K, 3
1051
1166
ble CGEMM_L1x4_SUB0
1052
1167
cmpwi cr0, L, 1
@@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
1146
1261
1147
1262
andi. T1, M, 2
1148
1263
ble CGEMM_L1x2_END
1149
- mr BO, B
1264
+ mr BO, BBUFFER
1150
1265
srawi. L, K, 3
1151
1266
ble CGEMM_L1x2_SUB0
1152
1267
cmpwi cr0, L, 1
@@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
1246
1361
1247
1362
andi. T1, M, 1
1248
1363
ble CGEMM_L1x1_END
1249
- mr BO, B
1364
+ mr BO, BBUFFER
1250
1365
srawi. L, K, 3
1251
1366
ble CGEMM_L1x1_SUB0
1252
1367
cmpwi cr0, L, 1
0 commit comments