Skip to content

Commit 8fb5a1a

Browse files
committed
added optimized dtrsm_LT kernel for POWER8
1 parent 8a149e6 commit 8fb5a1a

File tree

4 files changed

+5633
-1
lines changed

4 files changed

+5633
-1
lines changed

kernel/power/KERNEL.POWER8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
5454
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
5555

5656
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
57-
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
57+
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
5858
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
5959
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
6060

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
/*********************************************************************/
2+
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* All rights reserved. */
4+
/* */
5+
/* Redistribution and use in source and binary forms, with or */
6+
/* without modification, are permitted provided that the following */
7+
/* conditions are met: */
8+
/* */
9+
/* 1. Redistributions of source code must retain the above */
10+
/* copyright notice, this list of conditions and the following */
11+
/* disclaimer. */
12+
/* */
13+
/* 2. Redistributions in binary form must reproduce the above */
14+
/* copyright notice, this list of conditions and the following */
15+
/* disclaimer in the documentation and/or other materials */
16+
/* provided with the distribution. */
17+
/* */
18+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31+
/* POSSIBILITY OF SUCH DAMAGE. */
32+
/* */
33+
/* The views and conclusions contained in the software and */
34+
/* documentation are those of the authors and should not be */
35+
/* interpreted as representing official policies, either expressed */
36+
/* or implied, of The University of Texas at Austin. */
37+
/*********************************************************************/
38+
39+
#define ASSEMBLER
40+
#include "common.h"
41+
#include "def_vsx.h"
42+
43+
/*
 * LOAD: load mnemonic chosen by word size (lwz when __64BIT__ is not
 * defined, ld when it is). It is not referenced in the part of this file
 * that follows the definitions; any users would be in the included files.
 */
#ifndef __64BIT__
44+
#define LOAD lwz
45+
#else
46+
#define LOAD ld
47+
#endif
48+
49+
/*
 * Stack frame layout.
 *   STACKSIZE - number of bytes the prologue subtracts from SP.
 *   ALPHA     - SP-relative slot inside that frame.
 *   FZERO     - SP-relative slot inside that frame.
 * The prologue stores f14-f31 at 0..136(SP) and the saved GPRs from
 * 144(SP) upward (up to 272(SP) in the 64-bit layout, 196(SP) in the
 * 32-bit layout), so both slots sit above the register save area.
 * ALPHA and FZERO are not referenced by the entry code in this file.
 */
#ifdef __64BIT__
50+
#define STACKSIZE 320
51+
#define ALPHA 296(SP)
52+
#define FZERO 304(SP)
53+
#else
54+
#define STACKSIZE 240
55+
#define ALPHA 224(SP)
56+
#define FZERO 232(SP)
57+
#endif
58+
59+
/*
 * Symbolic names for the registers holding the kernel arguments.
 * M, N and K are always r3, r4 and r5. The remaining names (A, B, C, LDC,
 * OFFSET) depend on the target OS, the word size and, on AIX/Apple, on
 * whether DOUBLE is defined.
 *
 * NOTE(review): if neither `linux` nor `_AIX`/`__APPLE__` is defined, A, B,
 * C, LDC and OFFSET are left undefined by this block.
 */
#define M r3
60+
#define N r4
61+
#define K r5
62+
63+
/* Linux mapping. */
#ifdef linux
64+
#ifndef __64BIT__
65+
#define A r6
66+
#define B r7
67+
#define C r8
68+
#define LDC r9
69+
#define OFFSET r10
70+
#else
71+
#define A r7
72+
#define B r8
73+
#define C r9
74+
#define LDC r10
75+
/* In this configuration the entry code loads OFFSET from the stack
 * (FRAMESLOT(0) + STACKSIZE(SP)), so r6 is the destination of that load. */
#define OFFSET r6
76+
#endif
77+
#endif
78+
79+
/* AIX / Apple mapping. */
#if defined(_AIX) || defined(__APPLE__)
80+
#if !defined(__64BIT__) && defined(DOUBLE)
81+
#define A r8
82+
#define B r9
83+
#define C r10
84+
/* In this configuration the entry code loads both LDC and OFFSET from the
 * stack (FRAMESLOT(0) and FRAMESLOT(1) respectively). */
#define LDC r7
85+
#define OFFSET r6
86+
#else
87+
#define A r7
88+
#define B r8
89+
#define C r9
90+
#define LDC r10
91+
/* The entry code loads OFFSET from FRAMESLOT(0) in this configuration. */
#define OFFSET r6
92+
#endif
93+
#endif
94+
95+
/*
 * Offset and scratch register aliases.
 *
 * o0 is the literal 0. o8, o16, o24, o32 and o48 are registers (r26-r30)
 * that the entry code loads with the constants 8, 16, 24, 32 and 48.
 * KK (r20) is initialised from OFFSET by the entry code.
 * The other aliases (PRE, T4, L, T3, T2, I, J, AO, BO, CO, T1) are not
 * referenced by the entry code in this file; presumably they are used by
 * the included dtrsm_macros_/dtrsm_logic_ files - confirm there.
 *
 * NOTE(review): r15, r16 and r17 (PRE, T4, L) are saved and restored only
 * in the __64BIT__ branch of the entry code; the 32-bit branch covers
 * r18-r31 only. Confirm this kernel is never built for 32-bit targets, or
 * that the included code does not write those registers.
 */
#define o0 0
96+
97+
#define PRE r15
98+
#define T4 r16
99+
#define L r17
100+
#define T3 r18
101+
#define T2 r19
102+
#define KK r20
103+
#define I r21
104+
#define J r22
105+
#define AO r23
106+
#define BO r24
107+
#define CO r25
108+
#define o8 r26
109+
#define o16 r27
110+
#define o24 r28
111+
#define o32 r29
112+
#define o48 r30
113+
#define T1 r31
114+
115+
#include "dtrsm_macros_LT_16x4_power8.S"
116+
117+
/*
 * Kernel entry point (emitted only when NEEDPARAM is not defined).
 *
 * Flow of the code below:
 *   1. Reserve STACKSIZE bytes on the stack and save f14-f31 plus a range
 *      of general-purpose registers.
 *   2. Fetch the arguments that are passed on the stack for the current
 *      OS / word-size configuration (LDC and/or OFFSET).
 *   3. Skip to L999 when M, N or K is <= 0.
 *   4. Scale LDC, load the constant offset registers, copy OFFSET to KK and
 *      run the textually included dtrsm_logic_LT_16x4_power8.S.
 *   5. L999: set r3 to 0, restore the saved registers, release the frame
 *      and return.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, FRAMESLOT, BASE_SHIFT and SP are not defined
 * in this file; presumably they come from common.h / def_vsx.h.
 */
#ifndef NEEDPARAM
118+
119+
PROLOGUE
120+
PROFCODE
121+
122+
/* Reserve the local frame. */
addi SP, SP, -STACKSIZE
123+
/* r0 = 0; r0 is not referenced again in the code visible in this file. */
li r0, 0
124+
125+
/* Save f14-f31 at 0..136(SP); they are reloaded after L999. */
stfd f14, 0(SP)
126+
stfd f15, 8(SP)
127+
stfd f16, 16(SP)
128+
stfd f17, 24(SP)
129+
130+
stfd f18, 32(SP)
131+
stfd f19, 40(SP)
132+
stfd f20, 48(SP)
133+
stfd f21, 56(SP)
134+
135+
stfd f22, 64(SP)
136+
stfd f23, 72(SP)
137+
stfd f24, 80(SP)
138+
stfd f25, 88(SP)
139+
140+
stfd f26, 96(SP)
141+
stfd f27, 104(SP)
142+
stfd f28, 112(SP)
143+
stfd f29, 120(SP)
144+
145+
stfd f30, 128(SP)
146+
stfd f31, 136(SP)
147+
148+
/* Save general-purpose registers starting at 144(SP).
 * 64-bit: r31 down to r15, 8 bytes each.
 * 32-bit: r31 down to r18, 4 bytes each. */
#ifdef __64BIT__
149+
std r31, 144(SP)
150+
std r30, 152(SP)
151+
std r29, 160(SP)
152+
std r28, 168(SP)
153+
std r27, 176(SP)
154+
std r26, 184(SP)
155+
std r25, 192(SP)
156+
std r24, 200(SP)
157+
std r23, 208(SP)
158+
std r22, 216(SP)
159+
std r21, 224(SP)
160+
std r20, 232(SP)
161+
std r19, 240(SP)
162+
std r18, 248(SP)
163+
std r17, 256(SP)
164+
std r16, 264(SP)
165+
std r15, 272(SP)
166+
#else
167+
/* NOTE(review): r15-r17 (aliased as PRE, T4 and L earlier in this file)
 * are not saved in this branch. If the included logic writes them in a
 * 32-bit build they would be clobbered for the caller - confirm. */
stw r31, 144(SP)
168+
stw r30, 148(SP)
169+
stw r29, 152(SP)
170+
stw r28, 156(SP)
171+
stw r27, 160(SP)
172+
stw r26, 164(SP)
173+
stw r25, 168(SP)
174+
stw r24, 172(SP)
175+
stw r23, 176(SP)
176+
stw r22, 180(SP)
177+
stw r21, 184(SP)
178+
stw r20, 188(SP)
179+
stw r19, 192(SP)
180+
stw r18, 196(SP)
181+
#endif
182+
183+
184+
/* AIX / Apple, 32-bit with DOUBLE: LDC is read from the stack.
 * The "+ STACKSIZE" term compensates for the frame reserved above. */
#if defined(_AIX) || defined(__APPLE__)
185+
#if !defined(__64BIT__) && defined(DOUBLE)
186+
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
187+
#endif
188+
#endif
189+
190+
191+
/* Linux, 64-bit: OFFSET is read from the stack. */
#if defined(linux) && defined(__64BIT__)
192+
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
193+
#endif
194+
195+
/* AIX / Apple: OFFSET is read from the stack; it is in FRAMESLOT(1) for
 * 32-bit DOUBLE builds (FRAMESLOT(0) holds LDC there) and in FRAMESLOT(0)
 * otherwise. */
#if defined(_AIX) || defined(__APPLE__)
196+
#ifdef __64BIT__
197+
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
198+
#else
199+
#ifdef DOUBLE
200+
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
201+
#else
202+
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
203+
#endif
204+
#endif
205+
#endif
206+
207+
208+
/* Nothing to do when any dimension is <= 0 (signed word compares):
 * branch straight to the common exit path. */
cmpwi cr0, M, 0
209+
ble L999
210+
cmpwi cr0, N, 0
211+
ble L999
212+
cmpwi cr0, K, 0
213+
ble L999
214+
215+
/* LDC <<= BASE_SHIFT. BASE_SHIFT is defined outside this file; presumably
 * this converts LDC from elements to bytes - confirm in common.h. */
slwi LDC, LDC, BASE_SHIFT
216+
217+
/* Load the constants 8, 16, 24, 32 and 48 into the registers named after
 * them (o8 ... o48). */
li o8, 8
218+
li o16, 16
219+
li o24, 24
220+
li o32, 32
221+
li o48, 48
222+
223+
/* KK = OFFSET. */
mr KK, OFFSET
224+
225+
/* The main body of the kernel is textually included here. */
#include "dtrsm_logic_LT_16x4_power8.S"
226+
227+
/* Common exit path, also reached directly by the early-out checks above. */
L999:
228+
/* r3 = 0 (addi with RA = 0 adds to the literal zero, not to r0). */
addi r3, 0, 0
229+
230+
/* Reload f14-f31 from the slots written in the prologue. */
lfd f14, 0(SP)
231+
lfd f15, 8(SP)
232+
lfd f16, 16(SP)
233+
lfd f17, 24(SP)
234+
235+
lfd f18, 32(SP)
236+
lfd f19, 40(SP)
237+
lfd f20, 48(SP)
238+
lfd f21, 56(SP)
239+
240+
lfd f22, 64(SP)
241+
lfd f23, 72(SP)
242+
lfd f24, 80(SP)
243+
lfd f25, 88(SP)
244+
245+
lfd f26, 96(SP)
246+
lfd f27, 104(SP)
247+
lfd f28, 112(SP)
248+
lfd f29, 120(SP)
249+
250+
lfd f30, 128(SP)
251+
lfd f31, 136(SP)
252+
253+
/* Reload the general-purpose registers; mirrors the save sequence in the
 * prologue (r15-r31 for 64-bit, r18-r31 for 32-bit). */
#ifdef __64BIT__
254+
ld r31, 144(SP)
255+
ld r30, 152(SP)
256+
ld r29, 160(SP)
257+
ld r28, 168(SP)
258+
ld r27, 176(SP)
259+
ld r26, 184(SP)
260+
ld r25, 192(SP)
261+
ld r24, 200(SP)
262+
ld r23, 208(SP)
263+
ld r22, 216(SP)
264+
ld r21, 224(SP)
265+
ld r20, 232(SP)
266+
ld r19, 240(SP)
267+
ld r18, 248(SP)
268+
ld r17, 256(SP)
269+
ld r16, 264(SP)
270+
ld r15, 272(SP)
271+
#else
272+
lwz r31, 144(SP)
273+
lwz r30, 148(SP)
274+
lwz r29, 152(SP)
275+
lwz r28, 156(SP)
276+
lwz r27, 160(SP)
277+
lwz r26, 164(SP)
278+
lwz r25, 168(SP)
279+
lwz r24, 172(SP)
280+
lwz r23, 176(SP)
281+
lwz r22, 180(SP)
282+
lwz r21, 184(SP)
283+
lwz r20, 188(SP)
284+
lwz r19, 192(SP)
285+
lwz r18, 196(SP)
286+
#endif
287+
288+
/* Release the frame reserved at entry. */
addi SP, SP, STACKSIZE
289+
290+
/* Return to the caller. */
blr
291+
292+
EPILOGUE
293+
#endif

0 commit comments

Comments
 (0)