Skip to content

Commit 0a4276b

Browse files
committed
Merge pull request #837 from wernsaar/develop
updated zgemm- and ztrmm-kernel for POWER8
2 parents d4380c1 + 08bddde commit 0a4276b

10 files changed: +3673 / -557 lines changed

benchmark/Makefile

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
3434
LIBVECLIB = -framework Accelerate
3535

3636
ESSL=/opt/ibm/lib
37-
LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a
37+
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
38+
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
3839

3940
ifeq ($(OSNAME), WINNT)
4041

@@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
259260
endif
260261

261262
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
262-
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl
263+
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
264+
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
263265

264266
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
265267
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX)
312314
slinpack.veclib : slinpack.$(SUFFIX)
313315
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
314316

317+
# Build the slinpack.essl benchmark executable from slinpack.$(SUFFIX),
# linking against the libraries listed in $(LIBESSL).
# $(@F) is the file part of the target name; $^ is the full prerequisite list.
# The leading '-' makes make ignore a failure of this command
# (presumably so the build can continue where ESSL is not installed -- confirm).
slinpack.essl : slinpack.$(SUFFIX)
318+
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
319+
315320
##################################### Dlinpack ####################################################
316321
dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME)
317322
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX)
328333
dlinpack.veclib : dlinpack.$(SUFFIX)
329334
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
330335

336+
# Build the dlinpack.essl benchmark executable from dlinpack.$(SUFFIX),
# linking against the libraries listed in $(LIBESSL).
# $(@F) is the file part of the target name; $^ is the full prerequisite list.
# The leading '-' makes make ignore a failure of this command
# (presumably so the build can continue where ESSL is not installed -- confirm).
dlinpack.essl : dlinpack.$(SUFFIX)
337+
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
338+
331339
##################################### Clinpack ####################################################
332340

333341
clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
@@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX)
345353
clinpack.veclib : clinpack.$(SUFFIX)
346354
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
347355

356+
# Build the clinpack.essl benchmark executable from clinpack.$(SUFFIX),
# linking against the libraries listed in $(LIBESSL).
# $(@F) is the file part of the target name; $^ is the full prerequisite list.
# The leading '-' makes make ignore a failure of this command
# (presumably so the build can continue where ESSL is not installed -- confirm).
clinpack.essl : clinpack.$(SUFFIX)
357+
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
358+
348359
##################################### Zlinpack ####################################################
349360

350361
zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
@@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX)
362373
zlinpack.veclib : zlinpack.$(SUFFIX)
363374
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
364375

376+
# Build the zlinpack.essl benchmark executable from zlinpack.$(SUFFIX),
# linking against the libraries listed in $(LIBESSL).
# $(@F) is the file part of the target name; $^ is the full prerequisite list.
# The leading '-' makes make ignore a failure of this command
# (presumably so the build can continue where ESSL is not installed -- confirm).
zlinpack.essl : zlinpack.$(SUFFIX)
377+
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
378+
365379
##################################### Scholesky ###################################################
366380

367381
scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)

common_power.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -798,7 +798,7 @@ Lmcount$lazy_ptr:
798798
#elif defined(PPC440FP2)
799799
#define BUFFER_SIZE ( 16 << 20)
800800
#elif defined(POWER8)
801-
#define BUFFER_SIZE ( 64 << 20)
801+
#define BUFFER_SIZE ( 32 << 20)
802802
#else
803803
#define BUFFER_SIZE ( 16 << 20)
804804
#endif

kernel/power/cgemm_kernel_8x4_power8.S

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8282
#endif
8383

8484
#ifdef __64BIT__
85-
#define STACKSIZE 400
85+
#define STACKSIZE 32000
8686
#define ALPHA_R_SP 296(SP)
8787
#define ALPHA_I_SP 304(SP)
8888
#define FZERO 312(SP)
@@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
136136
#define alpha_sr vs30
137137
#define alpha_si vs31
138138

139+
#define FRAMEPOINTER r12
139140

140141
#define BBUFFER r14
141142
#define L r15
@@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
161162
PROLOGUE
162163
PROFCODE
163164

165+
mr FRAMEPOINTER, SP
166+
addi SP, SP, -STACKSIZE
167+
addi SP, SP, -STACKSIZE
168+
addi SP, SP, -STACKSIZE
164169
addi SP, SP, -STACKSIZE
165170
li r0, 0
166171

@@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
233238

234239
#ifdef linux
235240
#ifdef __64BIT__
236-
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
241+
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
237242
#endif
238243
#endif
239244

240245
#if defined(_AIX) || defined(__APPLE__)
241246
#ifdef __64BIT__
242-
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
247+
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
243248
#else
244249
#ifdef DOUBLE
245-
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
246-
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
247-
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
250+
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
251+
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
252+
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
248253
#else
249-
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
254+
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
250255
#endif
251256
#endif
252257
#endif
253258

254259
#ifdef TRMMKERNEL
255260
#if defined(linux) && defined(__64BIT__)
256-
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
261+
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
257262
#endif
258263

259264
#if defined(_AIX) || defined(__APPLE__)
260265
#ifdef __64BIT__
261-
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
266+
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
262267
#else
263268
#ifdef DOUBLE
264-
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
269+
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
265270
#else
266-
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
271+
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
267272
#endif
268273
#endif
269274
#endif
@@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
290295
li o32 , 32
291296
li o48 , 48
292297

293-
li T1, 256
294-
slwi T1, T1, 9 // 131072
295-
sub BBUFFER, A, T1 // temp buffer for B unrolled
298+
addi BBUFFER, SP, 512+4096
299+
li T1, -4096
300+
and BBUFFER, BBUFFER, T1
296301

297302

298303
#ifdef __64BIT__
@@ -392,6 +397,9 @@ L999:
392397
#endif
393398

394399
addi SP, SP, STACKSIZE
400+
addi SP, SP, STACKSIZE
401+
addi SP, SP, STACKSIZE
402+
addi SP, SP, STACKSIZE
395403

396404
blr
397405

kernel/power/sgemm_kernel_16x8_power8.S

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8282
#endif
8383

8484
#ifdef __64BIT__
85-
#define STACKSIZE 340
85+
#define STACKSIZE 32752
8686
#define ALPHA_SP 296(SP)
8787
#define FZERO 304(SP)
8888
#else
@@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
132132

133133
#define o0 0
134134

135+
#define FRAMEPOINTER r12
136+
135137
#define BBUFFER r14
136138
#define o4 r15
137139
#define o12 r16
@@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
160162
PROLOGUE
161163
PROFCODE
162164

165+
mr FRAMEPOINTER, SP
166+
addi SP, SP, -STACKSIZE
167+
addi SP, SP, -STACKSIZE
168+
addi SP, SP, -STACKSIZE
163169
addi SP, SP, -STACKSIZE
164170
li r0, 0
165171

@@ -231,25 +237,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
231237

232238
#if defined(_AIX) || defined(__APPLE__)
233239
#if !defined(__64BIT__) && defined(DOUBLE)
234-
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
240+
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
235241
#endif
236242
#endif
237243

238244
slwi LDC, LDC, 2
239245

240246
#if defined(TRMMKERNEL)
241247
#if defined(linux) && defined(__64BIT__)
242-
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
248+
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
243249
#endif
244250

245251
#if defined(_AIX) || defined(__APPLE__)
246252
#ifdef __64BIT__
247-
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
253+
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
248254
#else
249255
#ifdef DOUBLE
250-
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
256+
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
251257
#else
252-
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
258+
lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
253259
#endif
254260
#endif
255261
#endif
@@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
271277
li o32, 32
272278
li o48, 48
273279

274-
li T1, 256
275-
slwi T1, T1, 9 // 131072
276-
sub BBUFFER, A, T1 // temp buffer for B unrolled
280+
addi BBUFFER, SP, 512+4096
281+
li T1, -4096
282+
and BBUFFER, BBUFFER, T1
277283

278284
addi T1, SP, 300
279285
stxsspx f1, o0 , T1
@@ -355,6 +361,9 @@ L999:
355361
#endif
356362

357363
addi SP, SP, STACKSIZE
364+
addi SP, SP, STACKSIZE
365+
addi SP, SP, STACKSIZE
366+
addi SP, SP, STACKSIZE
358367

359368
blr
360369

0 commit comments

Comments
 (0)