Skip to content

Commit e173c51

Browse files
committed
updated zgemm- and ztrmm-kernel for POWER8
1 parent 9c42f03 commit e173c51

File tree

6 files changed

+3611
-526
lines changed

6 files changed

+3611
-526
lines changed

kernel/power/zgemm_kernel_8x2_power8.S

Lines changed: 40 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,3 @@
1-
/***************************************************************************
2-
Copyright (c) 2013-2016, The OpenBLAS Project
3-
All rights reserved.
4-
Redistribution and use in source and binary forms, with or without
5-
modification, are permitted provided that the following conditions are
6-
met:
7-
1. Redistributions of source code must retain the above copyright
8-
notice, this list of conditions and the following disclaimer.
9-
2. Redistributions in binary form must reproduce the above copyright
10-
notice, this list of conditions and the following disclaimer in
11-
the documentation and/or other materials provided with the
12-
distribution.
13-
3. Neither the name of the OpenBLAS project nor the names of
14-
its contributors may be used to endorse or promote products
15-
derived from this software without specific prior written permission.
16-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19-
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20-
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25-
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26-
*****************************************************************************/
27-
28-
/**************************************************************************************
29-
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
30-
* BLASTEST : OK
31-
* CTEST : OK
32-
* TEST : OK
33-
* LAPACK-TEST : OK
34-
**************************************************************************************/
35-
361
/*********************************************************************/
372
/* Copyright 2009, 2010 The University of Texas at Austin. */
383
/* All rights reserved. */
@@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8247
#endif
8348

8449
#ifdef __64BIT__
85-
#define STACKSIZE 320
50+
#define STACKSIZE 32000
8651
#define ALPHA_R_SP 296(SP)
8752
#define ALPHA_I_SP 304(SP)
8853
#define FZERO 312(SP)
@@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13398
#define alpha_r vs30
13499
#define alpha_i vs31
135100

101+
102+
#define FRAMEPOINTER r12
103+
104+
#define BBUFFER r14
105+
136106
#define L r15
137107
#define ALPHA r16
138108
#define o24 r17
139109
#define T2 r19
140-
#define KK r20
110+
#define BBO r20
141111
#define o8 r21
142112
#define I r22
143113
#define J r23
@@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
156126
PROLOGUE
157127
PROFCODE
158128

159-
addi SP, SP, -STACKSIZE
160-
li r0, 0
129+
mr FRAMEPOINTER, SP
130+
addi SP, SP, -STACKSIZE
131+
addi SP, SP, -STACKSIZE
132+
addi SP, SP, -STACKSIZE
133+
addi SP, SP, -STACKSIZE
134+
li r0, 0
161135

162136
stfd f14, 0(SP)
163137
stfd f15, 8(SP)
@@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
200174
std r17, 256(SP)
201175
std r16, 264(SP)
202176
std r15, 272(SP)
177+
std r14, 280(SP)
203178
#else
204179
stw r31, 144(SP)
205180
stw r30, 148(SP)
@@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
226201

227202
#ifdef linux
228203
#ifdef __64BIT__
229-
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
204+
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
230205
#endif
231206
#endif
232207

233208
#if defined(_AIX) || defined(__APPLE__)
234209
#ifdef __64BIT__
235-
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
210+
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
236211
#else
237212
#ifdef DOUBLE
238-
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
239-
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
240-
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
213+
lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER)
214+
lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER)
215+
lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
241216
#else
242-
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
217+
lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
243218
#endif
244219
#endif
245220
#endif
246221

247222
#ifdef TRMMKERNEL
248223
#if defined(linux) && defined(__64BIT__)
249-
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
224+
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
250225
#endif
251226

252227
#if defined(_AIX) || defined(__APPLE__)
253228
#ifdef __64BIT__
254-
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
229+
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
255230
#else
256231
#ifdef DOUBLE
257-
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
232+
lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER)
258233
#else
259-
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
234+
lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
260235
#endif
261236
#endif
262237
#endif
@@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
268243
#include "zgemm_macros_8x2_power8.S"
269244

270245
cmpwi cr0, M, 0
271-
ble .L999
246+
ble L999
272247
cmpwi cr0, N, 0
273-
ble .L999
248+
ble L999
274249
cmpwi cr0, K, 0
275-
ble .L999
250+
ble L999
276251

277252
slwi LDC, LDC, ZBASE_SHIFT
278-
li PRE, 256
253+
li PRE, 384
279254
li o8 , 8
280255
li o16 , 16
281256
li o24 , 24
282257
li o32 , 32
283258
li o48 , 48
284259

260+
addi BBUFFER, SP, 512+4096
261+
li T1, -4096
262+
and BBUFFER, BBUFFER, T1
263+
285264
#ifdef __64BIT__
286265
addi ALPHA, SP, 296
287266
#else
288267
addi ALPHA, SP, 224
289268
#endif
290269

291-
lxvdsx alpha_r, 0, ALPHA
292-
lxvdsx alpha_i, o8, ALPHA
270+
lxsdx alpha_r, 0, ALPHA
271+
lxsdx alpha_i, o8, ALPHA
293272

294-
.align 5
273+
.align 4
295274

296275
#include "zgemm_logic_8x2_power8.S"
297276

298-
.L999:
277+
L999:
299278
addi r3, 0, 0
300279

301280
lfd f14, 0(SP)
@@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
339318
ld r17, 256(SP)
340319
ld r16, 264(SP)
341320
ld r15, 272(SP)
321+
ld r14, 280(SP)
342322
#else
343323
lwz r31, 144(SP)
344324
lwz r30, 148(SP)
@@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
360340
#endif
361341

362342
addi SP, SP, STACKSIZE
343+
addi SP, SP, STACKSIZE
344+
addi SP, SP, STACKSIZE
345+
addi SP, SP, STACKSIZE
363346

364347
blr
365348

0 commit comments

Comments
 (0)