Skip to content

Commit 8a149e6

Browse files
committed
Merge pull request #879 from wernsaar/develop
optimized dgemm and dgetrf for POWER8
2 parents 8bf71e9 + 956be69 commit 8a149e6

8 files changed, with 97 additions and 62 deletions.

common.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -332,6 +332,13 @@ typedef int blasint;
332332
#endif
333333
#endif
334334

335+
#ifdef POWER8
336+
#ifndef YIELDING
337+
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
338+
#endif
339+
#endif
340+
341+
335342
/*
336343
#ifdef PILEDRIVER
337344
#ifndef YIELDING

kernel/power/dgemm_logic_16x4_power8.S

Lines changed: 37 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3333
* LAPACK-TEST : OK
3434
**************************************************************************************/
3535

36+
#define MY_ALIGN .align 3
3637

3738
srawi. J, N, 2
3839
ble LDGEMM_L4_END
@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN:
5354
srawi. I, M, 4
5455
ble LDGEMM_L4x16_END
5556

56-
.align 4
57+
MY_ALIGN
5758
LDGEMM_L4x16_BEGIN_FIRST:
5859

5960
li L, -128
@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST:
9091
cmpwi cr0, L, 1
9192
ble LDGEMM_L4x16_SUB4_FIRST
9293

93-
.align 4
94+
MY_ALIGN
9495
LDGEMM_L4x16_LOOP_START_FIRST:
9596

9697
li T2, 512
@@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST:
115116
ble LDGEMM_L4x16_LOOP_END_FIRST
116117
mtctr L
117118

118-
.align 4
119+
MY_ALIGN
119120

120121
LDGEMM_L4x16_LOOP_FIRST:
121122

@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST:
132133

133134
bdnz LDGEMM_L4x16_LOOP_FIRST
134135

135-
.align 4
136+
MY_ALIGN
136137

137138
LDGEMM_L4x16_LOOP_END_FIRST:
138139

@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST:
175176
addic. L, L, -1
176177
bgt LDGEMM_L4x16_SUB2_FIRST
177178

178-
.align 4
179+
MY_ALIGN
179180
LDGEMM_L4x16_SAVE_FIRST:
180181

181182
SAVE4x16
@@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST:
185186

186187
LDGEMM_L4x16_END_FIRST:
187188

188-
.align 4
189+
MY_ALIGN
190+
189191
LDGEMM_L4x16_BEGIN:
190192

191193
li L, -128
@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN:
222224
cmpwi cr0, L, 1
223225
ble- LDGEMM_L4x16_SUB4
224226

225-
.align 4
227+
MY_ALIGN
228+
226229
LDGEMM_L4x16_LOOP_START:
227230

228231
li o40, 40
@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START:
239242
ble- LDGEMM_L4x16_LOOP_END
240243
mtctr L
241244

242-
.align 4
245+
MY_ALIGN
243246

244247
LDGEMM_L4x16_LOOP:
245248

246-
247249
dcbt AO, PRE
248250
KERNEL4x16_L1
249251
dcbt AO, PRE
250-
// addic. L, L, -1
251252
KERNEL4x16_L2
252253

253254
bdnz+ LDGEMM_L4x16_LOOP
254255

255-
.align 4
256+
257+
MY_ALIGN
256258

257259
LDGEMM_L4x16_LOOP_END:
258260

@@ -261,13 +263,17 @@ LDGEMM_L4x16_LOOP_END:
261263

262264
b LDGEMM_L4x16_SUB1
263265

266+
MY_ALIGN
267+
264268
LDGEMM_L4x16_SUB4:
265269

266270
KERNEL4x16_SUBI1
267271
KERNEL4x16_SUB1
268272

269273
b LDGEMM_L4x16_SUB1
270274

275+
MY_ALIGN
276+
271277
LDGEMM_L4x16_SUB0:
272278

273279
andi. L, K, 1
@@ -278,19 +284,24 @@ LDGEMM_L4x16_SUB0:
278284
ble LDGEMM_L4x16_SAVE
279285
b LDGEMM_L4x16_SUB2
280286

287+
MY_ALIGN
288+
281289
LDGEMM_L4x16_SUB1:
282290

283291
andi. L, K, 1
284292
ble LDGEMM_L4x16_SAVE
285293

294+
MY_ALIGN
295+
286296
LDGEMM_L4x16_SUB2:
287297

288298
KERNEL4x16_SUB1
289299

290300
addic. L, L, -1
291301
bgt LDGEMM_L4x16_SUB2
292302

293-
.align 4
303+
MY_ALIGN
304+
294305
LDGEMM_L4x16_SAVE:
295306

296307
SAVE4x16
@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START:
334345
addic. L, L, -2
335346
ble LDGEMM_L4x8_LOOP_END
336347

337-
.align 5
348+
MY_ALIGN
338349

339350
LDGEMM_L4x8_LOOP:
340351

@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START:
441452
addic. L, L, -2
442453
ble LDGEMM_L4x4_LOOP_END
443454

444-
.align 5
455+
MY_ALIGN
445456

446457
LDGEMM_L4x4_LOOP:
447458

@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START:
543554
addic. L, L, -2
544555
ble LDGEMM_L4x2_LOOP_END
545556

546-
.align 5
557+
MY_ALIGN
547558

548559
LDGEMM_L4x2_LOOP:
549560

@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START:
643654
addic. L, L, -2
644655
ble LDGEMM_L4x1_LOOP_END
645656

646-
.align 5
657+
MY_ALIGN
647658

648659
LDGEMM_L4x1_LOOP:
649660

@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START:
778789
addic. L, L, -2
779790
ble LDGEMM_L2x16_LOOP_END
780791

781-
.align 5
792+
MY_ALIGN
782793

783794
LDGEMM_L2x16_LOOP:
784795

@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START:
907918
addic. L, L, -2
908919
ble LDGEMM_L2x8_LOOP_END
909920

910-
.align 5
921+
MY_ALIGN
911922

912923
LDGEMM_L2x8_LOOP:
913924

@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START:
10111022
addic. L, L, -2
10121023
ble LDGEMM_L2x4_LOOP_END
10131024

1014-
.align 5
1025+
MY_ALIGN
10151026

10161027
LDGEMM_L2x4_LOOP:
10171028

@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START:
11111122
addic. L, L, -2
11121123
ble LDGEMM_L2x2_LOOP_END
11131124

1114-
.align 5
1125+
MY_ALIGN
11151126

11161127
LDGEMM_L2x2_LOOP:
11171128

@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START:
12111222
addic. L, L, -2
12121223
ble LDGEMM_L2x1_LOOP_END
12131224

1214-
.align 5
1225+
MY_ALIGN
12151226

12161227
LDGEMM_L2x1_LOOP:
12171228

@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START:
13311342
addic. L, L, -2
13321343
ble LDGEMM_L1x16_LOOP_END
13331344

1334-
.align 5
1345+
MY_ALIGN
13351346

13361347
LDGEMM_L1x16_LOOP:
13371348

@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START:
14601471
addic. L, L, -2
14611472
ble LDGEMM_L1x8_LOOP_END
14621473

1463-
.align 5
1474+
MY_ALIGN
14641475

14651476
LDGEMM_L1x8_LOOP:
14661477

@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START:
15641575
addic. L, L, -2
15651576
ble LDGEMM_L1x4_LOOP_END
15661577

1567-
.align 5
1578+
MY_ALIGN
15681579

15691580
LDGEMM_L1x4_LOOP:
15701581

@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START:
16641675
addic. L, L, -2
16651676
ble LDGEMM_L1x2_LOOP_END
16661677

1667-
.align 5
1678+
MY_ALIGN
16681679

16691680
LDGEMM_L1x2_LOOP:
16701681

@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START:
17641775
addic. L, L, -2
17651776
ble LDGEMM_L1x1_LOOP_END
17661777

1767-
.align 5
1778+
MY_ALIGN
17681779

17691780
LDGEMM_L1x1_LOOP:
17701781

kernel/power/dgemm_ncopy_macros_4_power8.S

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
127127
xxpermdi vs62, vs7, vs15, 3
128128
xxpermdi vs63, vs23, vs31, 3
129129

130+
dcbt BO, PREB
130131

131132
stxvd2x vs32, o0, BO
132133
stxvd2x vs33, o16, BO
@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
138139
stxvd2x vs39, o112, BO
139140
addi BO, BO, 128
140141

142+
dcbt BO, PREB
143+
141144
stxvd2x vs40, o0, BO
142145
stxvd2x vs41, o16, BO
143146
stxvd2x vs42, o32, BO
@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
148151
stxvd2x vs47, o112, BO
149152
addi BO, BO, 128
150153

154+
dcbt BO, PREB
155+
151156
stxvd2x vs48, o0, BO
152157
stxvd2x vs49, o16, BO
153158
stxvd2x vs50, o32, BO
@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
158163
stxvd2x vs55, o112, BO
159164
addi BO, BO, 128
160165

166+
dcbt BO, PREB
167+
161168
stxvd2x vs56, o0, BO
162169
stxvd2x vs57, o16, BO
163170
stxvd2x vs58, o32, BO

kernel/power/dgemm_tcopy_16_power8.S

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
170170
add B2, B2, B
171171
add B1, B1, B
172172

173-
li PREA, 256
173+
li PREA, 384
174174
addi PREB, M16, 128
175175

176176
li o8, 8

kernel/power/dgemm_tcopy_logic_16_power8.S

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN:
5252
ble DCOPYT_L4x8_BEGIN
5353

5454
mr BO, B16
55+
addi T2, M16, 384
56+
mtctr J
5557

5658
.align 5
5759

5860
DCOPYT_L4x16_LOOP:
5961

60-
/*
61-
addi T1, PREB, 128
62-
addi T2, PREB, 256
63-
*/
62+
addi T1, M16, 256
63+
6464
dcbt A0, PREA
6565
dcbt A1, PREA
6666
dcbt A2, PREA
6767
dcbt A3, PREA
68-
/*
69-
dcbtst BO, M16
70-
dcbtst BO, PREB
71-
dcbtst BO, T1
72-
dcbtst BO, T2
73-
*/
68+
69+
dcbt BO, M16
70+
dcbt BO, PREB
71+
dcbt BO, T1
72+
dcbt BO, T2
73+
7474
COPY_4x16
7575

7676
add BO, BO, M16
7777

78-
addic. J, J, -1
79-
bgt DCOPYT_L4x16_LOOP
78+
// addic. J, J, -1
79+
bdnz+ DCOPYT_L4x16_LOOP
8080

8181
DCOPYT_L4x8_BEGIN:
8282

0 commit comments

Comments (0)