Skip to content

Commit 7282419

Browse files
committed
Merge pull request #833 from wernsaar/develop
updated optimized cgemm- and ctrmm-kernel for POWER8
2 parents e1cdd15 + c5b1fbc commit 7282419

File tree

7 files changed

+7393
-905
lines changed

7 files changed

+7393
-905
lines changed

kernel/power/cgemm_kernel_8x4_power8.S

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

2828
/**************************************************************************************
29-
* 2016/04/03 Werner Saar ([email protected])
29+
* 2016/04/04 Werner Saar ([email protected])
3030
* BLASTEST : OK
3131
* CTEST : OK
3232
* TEST : OK
@@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
137137
#define alpha_si vs31
138138

139139

140-
#define NOTUSED r14
140+
#define BBUFFER r14
141141
#define L r15
142142
#define o12 r16
143143
#define o4 r17
144144
#define T2 r19
145-
#define KK r20
145+
#define BBO r20
146146
#define o8 r21
147147
#define I r22
148148
#define J r23
@@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
290290
li o32 , 32
291291
li o48 , 48
292292

293+
li T1, 256
294+
slwi T1, T1, 9 // 131072
295+
sub BBUFFER, A, T1 // temp buffer for B unrolled
296+
293297

294298
#ifdef __64BIT__
295299
addi T1 , SP, 296

kernel/power/cgemm_logic_8x4_power8.S

Lines changed: 130 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

2828
/**************************************************************************************
29-
* 2016/04/03 Werner Saar ([email protected])
29+
* 2016/04/04 Werner Saar ([email protected])
3030
* BLASTEST : OK
3131
* CTEST : OK
3232
* TEST : OK
@@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838

3939
CGEMM_L4_BEGIN:
4040

// Expand the current B panel into the scratch area at BBUFFER.
// Every 32-bit word read from B is replicated across a full 16-byte vector
// (xxspltw) and stored, so one pass consumes 32 bytes of B (8 words) and
// produces 128 bytes at BBO.
//   BO  : read cursor, starts at B, advances 32 bytes per pass
//   BBO : write cursor, starts at BBUFFER, advances 2 x 64 bytes per pass
//   T1  : remaining-word counter, starts at K << 3, drops by 8 per pass
// Clobbers T1, BO, BBO, vs3-vs7, vs11-vs15 and CR0.
41+
mr BO, B
42+
mr BBO, BBUFFER
43+
slwi T1, K, 3
44+
45+
CGEMM_L4_COPYB:
46+
dcbtst BBO, PRE
47+
48+
lxvw4x vs3, o0, BO
49+
lxvw4x vs11, o16, BO
50+
xxspltw vs4, vs3, 0
51+
xxspltw vs5, vs3, 1
52+
xxspltw vs6, vs3, 2
53+
xxspltw vs7, vs3, 3
54+
xxspltw vs12, vs11, 0
55+
xxspltw vs13, vs11, 1
56+
xxspltw vs14, vs11, 2
57+
xxspltw vs15, vs11, 3
58+
stxvw4x vs4, o0, BBO
59+
stxvw4x vs5, o16, BBO
60+
stxvw4x vs6, o32, BBO
61+
stxvw4x vs7, o48, BBO
62+
addi BO, BO, 32
63+
addi BBO, BBO, 64
64+
stxvw4x vs12, o0, BBO
65+
stxvw4x vs13, o16, BBO
66+
stxvw4x vs14, o32, BBO
67+
stxvw4x vs15, o48, BBO
68+
addic. T1, T1, -8
69+
addi BBO, BBO, 64
70+
// CR0 still reflects the addic. result (addi does not touch CR0).
// Loop only while words remain: branching on >= 0 would run one more pass
// after the counter reaches exactly 0, reading 32 bytes beyond the K << 3
// words of the panel and writing 128 bytes beyond the expanded copy.
71+
bgt CGEMM_L4_COPYB
72+
73+
4174
mr CO, C
4275
mr AO, A
4376
slwi T1, LDC , 2
@@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
4881
CGEMM_L4x8_BEGIN:
4982

5083

51-
mr BO, B
84+
mr BO, BBUFFER
5285
srawi. L, K, 3
5386
ble CGEMM_L4x8_SUB0
5487
cmpwi cr0, L, 1
@@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
5992
dcbt AO, PRE
6093
dcbt BO, PRE
6194
LOAD4x8_1
95+
dcbt BO, PRE
6296
KERNEL4x8_I1
97+
dcbt BO, PRE
6398
dcbt AO, PRE
6499
KERNEL4x8_2
100+
dcbt BO, PRE
65101
KERNEL4x8_1
102+
dcbt BO, PRE
66103
dcbt AO, PRE
67104
KERNEL4x8_2
68105

106+
dcbt BO, PRE
69107
KERNEL4x8_1
70-
dcbt AO, PRE
71108
dcbt BO, PRE
109+
dcbt AO, PRE
72110
KERNEL4x8_2
111+
dcbt BO, PRE
73112
KERNEL4x8_1
113+
dcbt BO, PRE
74114
dcbt AO, PRE
75115
KERNEL4x8_2
76116

@@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
81121

82122
CGEMM_L4x8_LOOP:
83123

124+
dcbt BO, PRE
84125
KERNEL4x8_1
126+
dcbt BO, PRE
85127
dcbt AO, PRE
86128
KERNEL4x8_2
129+
dcbt BO, PRE
87130
KERNEL4x8_1
131+
dcbt BO, PRE
88132
dcbt AO, PRE
89133
KERNEL4x8_2
90134

135+
dcbt BO, PRE
91136
KERNEL4x8_1
92-
dcbt AO, PRE
93137
dcbt BO, PRE
138+
dcbt AO, PRE
94139
KERNEL4x8_2
140+
dcbt BO, PRE
95141
KERNEL4x8_1
142+
dcbt BO, PRE
96143
dcbt AO, PRE
97144
KERNEL4x8_2
98145

@@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
101148

102149
CGEMM_L4x8_LOOP_END:
103150

151+
dcbt BO, PRE
104152
KERNEL4x8_1
153+
dcbt BO, PRE
105154
dcbt AO, PRE
106155
KERNEL4x8_2
107156
KERNEL4x8_1
@@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
168217

169218
andi. T1, M, 4
170219
ble CGEMM_L4x4_END
171-
mr BO, B
220+
mr BO, BBUFFER
172221
srawi. L, K, 3
173222
ble CGEMM_L4x4_SUB0
174223
cmpwi cr0, L, 1
@@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
268317

269318
andi. T1, M, 2
270319
ble CGEMM_L4x2_END
271-
mr BO, B
320+
mr BO, BBUFFER
272321
srawi. L, K, 3
273322
ble CGEMM_L4x2_SUB0
274323
cmpwi cr0, L, 1
@@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
368417

369418
andi. T1, M, 1
370419
ble CGEMM_L4x1_END
371-
mr BO, B
420+
mr BO, BBUFFER
372421
srawi. L, K, 3
373422
ble CGEMM_L4x1_SUB0
374423
cmpwi cr0, L, 1
@@ -482,6 +531,39 @@ L999_H1:
482531

483532
CGEMM_L2_BEGIN:
484533

// Expand the current B panel into the scratch area at BBUFFER.
// Every 32-bit word read from B is replicated across a full 16-byte vector
// (xxspltw) and stored, so one pass consumes 32 bytes of B (8 words) and
// produces 128 bytes at BBO.
//   BO  : read cursor, starts at B, advances 32 bytes per pass
//   BBO : write cursor, starts at BBUFFER, advances 2 x 64 bytes per pass
//   T1  : remaining-word counter, starts at K << 2, drops by 8 per pass
// Clobbers T1, BO, BBO, vs3-vs7, vs11-vs15 and CR0.
// NOTE(review): this copy executes before the "andi. T1, N, 2" test that
// follows it, so it also runs when that test then skips the section -
// confirm whether the copy should be placed after the test.
534+
mr BO, B
535+
mr BBO, BBUFFER
536+
slwi T1, K, 2
537+
538+
CGEMM_L2_COPYB:
539+
dcbtst BBO, PRE
540+
541+
lxvw4x vs3, o0, BO
542+
lxvw4x vs11, o16, BO
543+
xxspltw vs4, vs3, 0
544+
xxspltw vs5, vs3, 1
545+
xxspltw vs6, vs3, 2
546+
xxspltw vs7, vs3, 3
547+
xxspltw vs12, vs11, 0
548+
xxspltw vs13, vs11, 1
549+
xxspltw vs14, vs11, 2
550+
xxspltw vs15, vs11, 3
551+
stxvw4x vs4, o0, BBO
552+
stxvw4x vs5, o16, BBO
553+
stxvw4x vs6, o32, BBO
554+
stxvw4x vs7, o48, BBO
555+
addi BO, BO, 32
556+
addi BBO, BBO, 64
557+
stxvw4x vs12, o0, BBO
558+
stxvw4x vs13, o16, BBO
559+
stxvw4x vs14, o32, BBO
560+
stxvw4x vs15, o48, BBO
561+
addic. T1, T1, -8
562+
addi BBO, BBO, 64
563+
// CR0 still reflects the addic. result (addi does not touch CR0).
// Loop only while words remain: branching on >= 0 would run one more pass
// whenever the count is an exact multiple of 8, reading 32 bytes beyond the
// K << 2 words of the panel and writing 128 bytes beyond the expanded copy.
564+
bgt CGEMM_L2_COPYB
565+
566+
485567
andi. T1, N, 2
486568
ble CGEMM_L2_END
487569
mr CO, C
@@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
494576
CGEMM_L2x8_BEGIN:
495577

496578

497-
mr BO, B
579+
mr BO, BBUFFER
498580
srawi. L, K, 3
499581
ble CGEMM_L2x8_SUB0
500582
cmpwi cr0, L, 1
@@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
611693

612694
andi. T1, M, 4
613695
ble CGEMM_L2x4_END
614-
mr BO, B
696+
mr BO, BBUFFER
615697
srawi. L, K, 3
616698
ble CGEMM_L2x4_SUB0
617699
cmpwi cr0, L, 1
@@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
711793

712794
andi. T1, M, 2
713795
ble CGEMM_L2x2_END
714-
mr BO, B
796+
mr BO, BBUFFER
715797
srawi. L, K, 3
716798
ble CGEMM_L2x2_SUB0
717799
cmpwi cr0, L, 1
@@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
811893

812894
andi. T1, M, 1
813895
ble CGEMM_L2x1_END
814-
mr BO, B
896+
mr BO, BBUFFER
815897
srawi. L, K, 3
816898
ble CGEMM_L2x1_SUB0
817899
cmpwi cr0, L, 1
@@ -919,6 +1001,39 @@ L999_H2:
9191001

9201002
CGEMM_L1_BEGIN:
9211003

// Expand the current B panel into the scratch area at BBUFFER.
// Every 32-bit word read from B is replicated across a full 16-byte vector
// (xxspltw) and stored, so one pass consumes 32 bytes of B (8 words) and
// produces 128 bytes at BBO.
//   BO  : read cursor, starts at B, advances 32 bytes per pass
//   BBO : write cursor, starts at BBUFFER, advances 2 x 64 bytes per pass
//   T1  : remaining-word counter, starts at K << 1, drops by 8 per pass
// Clobbers T1, BO, BBO, vs3-vs7, vs11-vs15 and CR0.
// NOTE(review): this copy executes before the "andi. T1, N, 1" test that
// follows it, so it also runs when that test then skips the section -
// confirm whether the copy should be placed after the test.
1004+
mr BO, B
1005+
mr BBO, BBUFFER
1006+
slwi T1, K, 1
1007+
1008+
CGEMM_L1_COPYB:
1009+
dcbtst BBO, PRE
1010+
1011+
lxvw4x vs3, o0, BO
1012+
lxvw4x vs11, o16, BO
1013+
xxspltw vs4, vs3, 0
1014+
xxspltw vs5, vs3, 1
1015+
xxspltw vs6, vs3, 2
1016+
xxspltw vs7, vs3, 3
1017+
xxspltw vs12, vs11, 0
1018+
xxspltw vs13, vs11, 1
1019+
xxspltw vs14, vs11, 2
1020+
xxspltw vs15, vs11, 3
1021+
stxvw4x vs4, o0, BBO
1022+
stxvw4x vs5, o16, BBO
1023+
stxvw4x vs6, o32, BBO
1024+
stxvw4x vs7, o48, BBO
1025+
addi BO, BO, 32
1026+
addi BBO, BBO, 64
1027+
stxvw4x vs12, o0, BBO
1028+
stxvw4x vs13, o16, BBO
1029+
stxvw4x vs14, o32, BBO
1030+
stxvw4x vs15, o48, BBO
1031+
addic. T1, T1, -8
1032+
addi BBO, BBO, 64
1033+
// CR0 still reflects the addic. result (addi does not touch CR0).
// Loop only while words remain: branching on >= 0 would run one more pass
// whenever the count is an exact multiple of 8, reading 32 bytes beyond the
// K << 1 words of the panel and writing 128 bytes beyond the expanded copy.
1034+
bgt CGEMM_L1_COPYB
1035+
1036+
9221037
andi. T1, N, 1
9231038
ble CGEMM_L1_END
9241039
mr CO, C
@@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
9291044
CGEMM_L1x8_BEGIN:
9301045

9311046

932-
mr BO, B
1047+
mr BO, BBUFFER
9331048
srawi. L, K, 3
9341049
ble CGEMM_L1x8_SUB0
9351050
cmpwi cr0, L, 1
@@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
10461161

10471162
andi. T1, M, 4
10481163
ble CGEMM_L1x4_END
1049-
mr BO, B
1164+
mr BO, BBUFFER
10501165
srawi. L, K, 3
10511166
ble CGEMM_L1x4_SUB0
10521167
cmpwi cr0, L, 1
@@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
11461261

11471262
andi. T1, M, 2
11481263
ble CGEMM_L1x2_END
1149-
mr BO, B
1264+
mr BO, BBUFFER
11501265
srawi. L, K, 3
11511266
ble CGEMM_L1x2_SUB0
11521267
cmpwi cr0, L, 1
@@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
12461361

12471362
andi. T1, M, 1
12481363
ble CGEMM_L1x1_END
1249-
mr BO, B
1364+
mr BO, BBUFFER
12501365
srawi. L, K, 3
12511366
ble CGEMM_L1x1_SUB0
12521367
cmpwi cr0, L, 1

0 commit comments

Comments
 (0)