Skip to content

Commit 88011f6

Browse files
committed
Merge pull request #876 from wernsaar/develop
optimized dgemm on power8 for 20 threads
2 parents: 5faffc1 + 8310d4d; commit: 88011f6

File tree

5 files changed: 191 additions (+191) and 53 deletions (-53)

Makefile.power

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,10 @@ endif
1313

1414
ifeq ($(CORE), POWER8)
1515
ifeq ($(USE_OPENMP), 1)
16-
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
16+
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
1717
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
1818
else
19-
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
19+
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
2020
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
2121
endif
2222
endif

common_power.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
803803
#elif defined(PPC440FP2)
804804
#define BUFFER_SIZE ( 16 << 20)
805805
#elif defined(POWER8)
806-
#define BUFFER_SIZE ( 32 << 20)
806+
#define BUFFER_SIZE ( 64 << 20)
807807
#else
808808
#define BUFFER_SIZE ( 16 << 20)
809809
#endif

kernel/power/dgemm_logic_16x4_power8.S

Lines changed: 149 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -39,13 +39,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3939

4040
LDGEMM_L4_BEGIN:
4141

42-
mr CO, C
42+
li T1, 128
43+
li T2, 256
4344
mr AO, A
44-
slwi T1, LDC , 2
45-
add C, C, T1
45+
46+
mr CO, C
47+
slwi T3, LDC , 2
48+
add C, C, T3
49+
50+
dcbt A, T1
51+
dcbt A, T2
52+
4653
srawi. I, M, 4
4754
ble LDGEMM_L4x16_END
4855

56+
.align 4
57+
LDGEMM_L4x16_BEGIN_FIRST:
58+
59+
li L, -128
60+
61+
mr T1, CO
62+
add T2, T1, LDC
63+
add T3, T2, LDC
64+
add T4, T3, LDC
65+
66+
and T1, T1, L
67+
and T2, T2, L
68+
and T3, T3, L
69+
and T4, T4, L
70+
71+
dcbt T1, r0
72+
dcbt T2, r0
73+
dcbt T3, r0
74+
dcbt T4, r0
75+
76+
mr BO, B
77+
srawi. L, K, 2
78+
79+
addi T1, T1, 128
80+
addi T2, T2, 128
81+
addi T3, T3, 128
82+
addi T4, T4, 128
83+
84+
dcbt T1, r0
85+
dcbt T2, r0
86+
dcbt T3, r0
87+
dcbt T4, r0
88+
89+
ble LDGEMM_L4x16_SUB0_FIRST
90+
cmpwi cr0, L, 1
91+
ble LDGEMM_L4x16_SUB4_FIRST
92+
93+
.align 4
94+
LDGEMM_L4x16_LOOP_START_FIRST:
95+
96+
li T2, 512
97+
li o40, 40
98+
li o56, 56
99+
100+
dcbt AO, PRE
101+
dcbt BO, T2
102+
LOAD4x16_1
103+
dcbt AO, PRE
104+
KERNEL4x16_I1
105+
dcbt AO, PRE
106+
addic. L, L, -2
107+
KERNEL4x16_L2
108+
109+
dcbt AO, PRE
110+
KERNEL4x16_L1
111+
dcbt AO, PRE
112+
dcbt BO, T2
113+
KERNEL4x16_L2
114+
115+
ble LDGEMM_L4x16_LOOP_END_FIRST
116+
mtctr L
117+
118+
.align 4
119+
120+
LDGEMM_L4x16_LOOP_FIRST:
121+
122+
dcbt AO, PRE
123+
KERNEL4x16_L1
124+
dcbt AO, PRE
125+
KERNEL4x16_L2
126+
127+
dcbt AO, PRE
128+
KERNEL4x16_L1
129+
dcbt AO, PRE
130+
dcbt BO, T2
131+
KERNEL4x16_L2
132+
133+
bdnz LDGEMM_L4x16_LOOP_FIRST
134+
135+
.align 4
136+
137+
LDGEMM_L4x16_LOOP_END_FIRST:
138+
139+
KERNEL4x16_L1
140+
KERNEL4x16_L2
141+
142+
KERNEL4x16_1
143+
KERNEL4x16_E2
144+
145+
b LDGEMM_L4x16_SUB1_FIRST
146+
147+
LDGEMM_L4x16_SUB4_FIRST:
148+
149+
KERNEL4x16_SUBI1
150+
KERNEL4x16_SUB1
151+
KERNEL4x16_SUB1
152+
KERNEL4x16_SUB1
153+
154+
b LDGEMM_L4x16_SUB1_FIRST
155+
156+
LDGEMM_L4x16_SUB0_FIRST:
157+
158+
andi. L, K, 3
159+
160+
KERNEL4x16_SUBI1
161+
162+
addic. L, L, -1
163+
ble LDGEMM_L4x16_SAVE_FIRST
164+
b LDGEMM_L4x16_SUB2_FIRST
165+
166+
LDGEMM_L4x16_SUB1_FIRST:
167+
168+
andi. L, K, 3
169+
ble LDGEMM_L4x16_SAVE_FIRST
170+
171+
LDGEMM_L4x16_SUB2_FIRST:
172+
173+
KERNEL4x16_SUB1
174+
175+
addic. L, L, -1
176+
bgt LDGEMM_L4x16_SUB2_FIRST
177+
178+
.align 4
179+
LDGEMM_L4x16_SAVE_FIRST:
180+
181+
SAVE4x16
182+
183+
addic. I, I, -1
184+
ble LDGEMM_L4x16_END
185+
186+
LDGEMM_L4x16_END_FIRST:
187+
49188
.align 4
50189
LDGEMM_L4x16_BEGIN:
51190

@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
79218
dcbt T3, r0
80219
dcbt T4, r0
81220

82-
ble LDGEMM_L4x16_SUB0
221+
ble- LDGEMM_L4x16_SUB0
83222
cmpwi cr0, L, 1
84-
ble LDGEMM_L4x16_SUB4
223+
ble- LDGEMM_L4x16_SUB4
85224

86225
.align 4
87226
LDGEMM_L4x16_LOOP_START:
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
97236
addic. L, L, -2
98237
KERNEL4x16_L2
99238

100-
ble LDGEMM_L4x16_LOOP_END
239+
ble- LDGEMM_L4x16_LOOP_END
240+
mtctr L
101241

102242
.align 4
103243

@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
107247
dcbt AO, PRE
108248
KERNEL4x16_L1
109249
dcbt AO, PRE
110-
addic. L, L, -1
250+
// addic. L, L, -1
111251
KERNEL4x16_L2
112252

113-
bgt LDGEMM_L4x16_LOOP
253+
bdnz+ LDGEMM_L4x16_LOOP
114254

115255
.align 4
116256

@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
156296
SAVE4x16
157297

158298
addic. I, I, -1
159-
bgt LDGEMM_L4x16_BEGIN
299+
bgt+ LDGEMM_L4x16_BEGIN
160300

161301
LDGEMM_L4x16_END:
162302

kernel/power/dgemm_macros_16x4_power8.S

Lines changed: 36 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -559,17 +559,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
559559

560560
.macro SAVE4x16
561561

562-
mr T1, CO
563-
add T2, T1, LDC
564-
add T3, T2, LDC
565-
add T4, T3, LDC
562+
add T2, CO, LDC
566563

567564
lxvd2x vs0, 0, CO
568565
lxvd2x vs1, o16, CO
569566
lxvd2x vs2, o32, CO
570567
lxvd2x vs3, o48, CO
571568
lxvd2x vs4, o64, CO
572569
lxvd2x vs5, o80, CO
570+
add T3, T2, LDC
573571
lxvd2x vs6, o96, CO
574572
lxvd2x vs7, o112, CO
575573

@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
579577
lxvd2x vs11, o48, T2
580578
lxvd2x vs12, o64, T2
581579
lxvd2x vs13, o80, T2
580+
add T4, T3, LDC
582581
lxvd2x vs14, o96, T2
583582
lxvd2x vs15, o112, T2
584583

@@ -592,80 +591,81 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
592591
lxvd2x vs31, o112, T3
593592

594593
xvmaddadp vs0, vs32, alpha_r
595-
xvmaddadp vs1, vs33, alpha_r
596-
xvmaddadp vs2, vs34, alpha_r
597-
xvmaddadp vs3, vs35, alpha_r
598-
xvmaddadp vs4, vs36, alpha_r
599-
xvmaddadp vs5, vs37, alpha_r
600-
xvmaddadp vs6, vs38, alpha_r
601-
xvmaddadp vs7, vs39, alpha_r
602-
603594
lxvd2x vs32, 0, T4
595+
xvmaddadp vs1, vs33, alpha_r
604596
lxvd2x vs33, o16, T4
597+
xvmaddadp vs2, vs34, alpha_r
605598
lxvd2x vs34, o32, T4
599+
xvmaddadp vs3, vs35, alpha_r
606600
lxvd2x vs35, o48, T4
601+
xvmaddadp vs4, vs36, alpha_r
607602
lxvd2x vs36, o64, T4
603+
xvmaddadp vs5, vs37, alpha_r
608604
lxvd2x vs37, o80, T4
605+
xvmaddadp vs6, vs38, alpha_r
609606
lxvd2x vs38, o96, T4
607+
xvmaddadp vs7, vs39, alpha_r
610608
lxvd2x vs39, o112, T4
611609

612610
xvmaddadp vs8, vs40, alpha_r
613611
xvmaddadp vs9, vs41, alpha_r
614612
xvmaddadp vs10, vs42, alpha_r
615613
xvmaddadp vs11, vs43, alpha_r
616614

617-
stxvd2x vs0, 0, T1
618-
stxvd2x vs1, o16, T1
619-
stxvd2x vs2, o32, T1
620-
stxvd2x vs3, o48, T1
621-
622615
xvmaddadp vs12, vs44, alpha_r
623616
xvmaddadp vs13, vs45, alpha_r
624617
xvmaddadp vs14, vs46, alpha_r
625618
xvmaddadp vs15, vs47, alpha_r
626619

627-
stxvd2x vs4, o64, T1
628-
stxvd2x vs5, o80, T1
629-
stxvd2x vs6, o96, T1
630-
stxvd2x vs7, o112, T1
631-
632620
xvmaddadp vs24, vs48, alpha_r
633621
xvmaddadp vs25, vs49, alpha_r
634622
xvmaddadp vs26, vs50, alpha_r
635623
xvmaddadp vs27, vs51, alpha_r
636624

637-
stxvd2x vs8, o0, T2
638-
stxvd2x vs9, o16, T2
639-
stxvd2x vs10, o32, T2
640-
stxvd2x vs11, o48, T2
641-
642625
xvmaddadp vs28, vs52, alpha_r
643626
xvmaddadp vs29, vs53, alpha_r
644627
xvmaddadp vs30, vs54, alpha_r
645628
xvmaddadp vs31, vs55, alpha_r
646629

647-
stxvd2x vs12, o64, T2
648-
stxvd2x vs13, o80, T2
649-
stxvd2x vs14, o96, T2
650-
stxvd2x vs15, o112, T2
630+
stxvd2x vs0, 0, CO
631+
stxvd2x vs1, o16, CO
632+
stxvd2x vs2, o32, CO
633+
stxvd2x vs3, o48, CO
634+
635+
stxvd2x vs4, o64, CO
636+
stxvd2x vs5, o80, CO
637+
stxvd2x vs6, o96, CO
638+
stxvd2x vs7, o112, CO
651639

652640
xvmaddadp vs32, vs56, alpha_r
653641
xvmaddadp vs33, vs57, alpha_r
654642
xvmaddadp vs34, vs58, alpha_r
655643
xvmaddadp vs35, vs59, alpha_r
656644

657-
stxvd2x vs24, 0, T3
658-
stxvd2x vs25, o16, T3
659-
stxvd2x vs26, o32, T3
660-
stxvd2x vs27, o48, T3
661-
662645
xvmaddadp vs36, vs60, alpha_r
663646
xvmaddadp vs37, vs61, alpha_r
664647
xvmaddadp vs38, vs62, alpha_r
665648
xvmaddadp vs39, vs63, alpha_r
666649

650+
addi CO, CO, 128
651+
652+
stxvd2x vs8, o0, T2
653+
stxvd2x vs9, o16, T2
654+
stxvd2x vs10, o32, T2
655+
stxvd2x vs11, o48, T2
656+
657+
stxvd2x vs12, o64, T2
658+
stxvd2x vs13, o80, T2
659+
stxvd2x vs14, o96, T2
660+
stxvd2x vs15, o112, T2
661+
662+
stxvd2x vs24, 0, T3
663+
stxvd2x vs25, o16, T3
667664
stxvd2x vs28, o64, T3
668665
stxvd2x vs29, o80, T3
666+
667+
stxvd2x vs26, o32, T3
668+
stxvd2x vs27, o48, T3
669669
stxvd2x vs30, o96, T3
670670
stxvd2x vs31, o112, T3
671671

@@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
674674
stxvd2x vs34, o32, T4
675675
stxvd2x vs35, o48, T4
676676

677-
addi CO, CO, 128
678-
679677
stxvd2x vs36, o64, T4
680678
stxvd2x vs37, o80, T4
681679
stxvd2x vs38, o96, T4

param.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19651965
#define DNUMOPT 8
19661966

19671967
#define GEMM_DEFAULT_OFFSET_A 0
1968-
#define GEMM_DEFAULT_OFFSET_B 4096
1969-
#define GEMM_DEFAULT_ALIGN 0x03fffUL
1968+
#define GEMM_DEFAULT_OFFSET_B 65536
1969+
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
19701970

19711971
#define SGEMM_DEFAULT_UNROLL_M 16
19721972
#define SGEMM_DEFAULT_UNROLL_N 8
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
19831983
#define ZGEMM_DEFAULT_P 320
19841984

19851985
#define SGEMM_DEFAULT_Q 640
1986-
#define DGEMM_DEFAULT_Q 640
1986+
#define DGEMM_DEFAULT_Q 720
19871987
#define CGEMM_DEFAULT_Q 640
19881988
#define ZGEMM_DEFAULT_Q 640
19891989

0 commit comments

Comments
 (0)