Skip to content

Commit 2216717

Browse files
authored
Merge pull request #1477 from quickwritereader/develop
Power8 blas3 copy-pack routines
2 parents 69d9f36 + 2c0a008 commit 2216717

10 files changed

+447
-749
lines changed

kernel/power/cgemm_tcopy_8_power8.S

Lines changed: 37 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
110110

111111
#include "cgemm_tcopy_macros_8_power8.S"
112112

113-
#define STACKSIZE 576
113+
#define STACKSIZE 144
114114

115115

116116
PROLOGUE
@@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
119119
addi SP, SP, -STACKSIZE
120120
li r0, 0
121121

122-
std r31, 144(SP)
123-
std r30, 152(SP)
124-
std r29, 160(SP)
125-
std r28, 168(SP)
126-
std r27, 176(SP)
127-
std r26, 184(SP)
128-
std r25, 192(SP)
129-
std r24, 200(SP)
130-
std r23, 208(SP)
131-
std r22, 216(SP)
132-
std r21, 224(SP)
133-
std r20, 232(SP)
134-
std r19, 240(SP)
135-
std r18, 248(SP)
136-
std r17, 256(SP)
137-
std r16, 264(SP)
138-
std r15, 272(SP)
139-
std r14, 280(SP)
140-
addi r11, SP, 288
141-
stvx v20, r11, r0
142-
addi r11, r11, 16
143-
stvx v21, r11, r0
144-
addi r11, r11, 16
145-
stvx v22, r11, r0
146-
addi r11, r11, 16
147-
stvx v23, r11, r0
148-
addi r11, r11, 16
149-
stvx v24, r11, r0
150-
addi r11, r11, 16
151-
stvx v25, r11, r0
152-
addi r11, r11, 16
153-
stvx v26, r11, r0
154-
addi r11, r11, 16
155-
stvx v27, r11, r0
156-
addi r11, r11, 16
157-
stvx v28, r11, r0
158-
addi r11, r11, 16
159-
stvx v29, r11, r0
160-
addi r11, r11, 16
161-
stvx v30, r11, r0
162-
addi r11, r11, 16
163-
stvx v31, r11, r0
164-
li r11, 0
122+
std r14, 0(SP)
123+
std r15, 8(SP)
124+
std r16, 16(SP)
125+
std r17, 24(SP)
126+
std r18, 32(SP)
127+
std r19, 40(SP)
128+
std r20, 48(SP)
129+
std r21, 56(SP)
130+
std r22, 64(SP)
131+
std r23, 72(SP)
132+
std r24, 80(SP)
133+
std r25, 88(SP)
134+
std r26, 96(SP)
135+
std r27, 104(SP)
136+
std r28, 112(SP)
137+
std r29, 120(SP)
138+
std r30, 128(SP)
139+
std r31, 136(SP)
165140

166141
cmpwi cr0, M, 0
167142
ble- L999
@@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
203178

204179
L999:
205180

206-
li r3, 0
207-
208-
ld r31, 144(SP)
209-
ld r30, 152(SP)
210-
ld r29, 160(SP)
211-
ld r28, 168(SP)
212-
ld r27, 176(SP)
213-
ld r26, 184(SP)
214-
ld r25, 192(SP)
215-
ld r24, 200(SP)
216-
ld r23, 208(SP)
217-
ld r22, 216(SP)
218-
ld r21, 224(SP)
219-
ld r20, 232(SP)
220-
ld r19, 240(SP)
221-
ld r18, 248(SP)
222-
ld r17, 256(SP)
223-
ld r16, 264(SP)
224-
ld r15, 272(SP)
225-
ld r14, 280(SP)
226-
addi r11, SP, 288
227-
lvx v20, r11, r3
228-
addi r11, r11, 16
229-
lvx v21, r11, r3
230-
addi r11, r11, 16
231-
lvx v22, r11, r3
232-
addi r11, r11, 16
233-
lvx v23, r11, r3
234-
addi r11, r11, 16
235-
lvx v24, r11, r3
236-
addi r11, r11, 16
237-
lvx v25, r11, r3
238-
addi r11, r11, 16
239-
lvx v26, r11, r3
240-
addi r11, r11, 16
241-
lvx v27, r11, r3
242-
addi r11, r11, 16
243-
lvx v28, r11, r3
244-
addi r11, r11, 16
245-
lvx v29, r11, r3
246-
addi r11, r11, 16
247-
lvx v30, r11, r3
248-
addi r11, r11, 16
249-
lvx v31, r11, r3
250-
li r11, 0
181+
ld r14, 0(SP)
182+
ld r15, 8(SP)
183+
ld r16, 16(SP)
184+
ld r17, 24(SP)
185+
ld r18, 32(SP)
186+
ld r19, 40(SP)
187+
ld r20, 48(SP)
188+
ld r21, 56(SP)
189+
ld r22, 64(SP)
190+
ld r23, 72(SP)
191+
ld r24, 80(SP)
192+
ld r25, 88(SP)
193+
ld r26, 96(SP)
194+
ld r27, 104(SP)
195+
ld r28, 112(SP)
196+
ld r29, 120(SP)
197+
ld r30, 128(SP)
198+
ld r31, 136(SP)
251199

252200
addi SP, SP, STACKSIZE
253201
blr

kernel/power/dgemm_ncopy_4_power8.S

Lines changed: 41 additions & 134 deletions
Original file line number | Diff line number | Diff line change
@@ -109,80 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
109109

110110
#include "dgemm_ncopy_macros_4_power8.S"
111111

112-
#define STACKSIZE 384
113-
#define STACKSIZE 576
112+
#define STACKSIZE 144
113+
114114

115115
PROLOGUE
116116
PROFCODE
117117

118118
addi SP, SP, -STACKSIZE
119-
//addi SP, SP, -208
119+
120120
li r0, 0
121121

122-
stfd f14, 0(SP)
123-
stfd f15, 8(SP)
124-
stfd f16, 16(SP)
125-
stfd f17, 24(SP)
126-
stfd f18, 32(SP)
127-
stfd f19, 40(SP)
128-
stfd f20, 48(SP)
129-
stfd f21, 56(SP)
130-
stfd f22, 64(SP)
131-
stfd f23, 72(SP)
132-
stfd f24, 80(SP)
133-
stfd f25, 88(SP)
134-
stfd f26, 96(SP)
135-
stfd f27, 104(SP)
136-
stfd f28, 112(SP)
137-
stfd f29, 120(SP)
138-
stfd f30, 128(SP)
139-
stfd f31, 136(SP)
140-
141-
142-
std r31, 144(SP)
143-
std r30, 152(SP)
144-
std r29, 160(SP)
145-
std r28, 168(SP)
146-
std r27, 176(SP)
147-
std r26, 184(SP)
148-
std r25, 192(SP)
149-
std r24, 200(SP)
150-
std r23, 208(SP)
151-
std r22, 216(SP)
152-
std r21, 224(SP)
153-
std r20, 232(SP)
154-
std r19, 240(SP)
155-
std r18, 248(SP)
156-
std r17, 256(SP)
157-
std r16, 264(SP)
158-
std r15, 272(SP)
159-
std r14, 280(SP)
122+
std r14, 0(SP)
123+
std r15, 8(SP)
124+
std r16, 16(SP)
125+
std r17, 24(SP)
126+
std r18, 32(SP)
127+
std r19, 40(SP)
128+
std r20, 48(SP)
129+
std r21, 56(SP)
130+
std r22, 64(SP)
131+
std r23, 72(SP)
132+
std r24, 80(SP)
133+
std r25, 88(SP)
134+
std r26, 96(SP)
135+
std r27, 104(SP)
136+
std r28, 112(SP)
137+
std r29, 120(SP)
138+
std r30, 128(SP)
139+
std r31, 136(SP)
160140

161-
addi r11,SP,288
162-
stvx v20, r11,r0
163-
addi r11,r11,16
164-
stvx v21, r11,r0
165-
addi r11,r11,16
166-
stvx v22, r11,r0
167-
addi r11,r11,16
168-
stvx v23, r11,r0
169-
addi r11,r11,16
170-
stvx v24, r11,r0
171-
addi r11,r11,16
172-
stvx v25, r11,r0
173-
addi r11,r11,16
174-
stvx v26, r11,r0
175-
addi r11,r11,16
176-
stvx v27, r11,r0
177-
addi r11,r11,16
178-
stvx v28, r11,r0
179-
addi r11,r11,16
180-
stvx v29, r11,r0
181-
addi r11,r11,16
182-
stvx v30, r11,r0
183-
addi r11,r11,16
184-
stvx v31, r11,r0
185-
li r11,0
186141

187142
cmpwi cr0, M, 0
188143
ble- L999
@@ -191,10 +146,8 @@ li r11,0
191146

192147
slwi LDA, LDA, BASE_SHIFT
193148

194-
//li PREA, 384
195-
//li PREB, 384
196-
li PREA, 576
197-
li PREB, 576
149+
li PREA, 384
150+
li PREB, 384
198151

199152

200153
li o8, 8
@@ -210,70 +163,24 @@ li r11,0
210163

211164
L999:
212165

213-
li r3, 0
214-
215-
lfd f14, 0(SP)
216-
lfd f15, 8(SP)
217-
lfd f16, 16(SP)
218-
lfd f17, 24(SP)
219-
lfd f18, 32(SP)
220-
lfd f19, 40(SP)
221-
lfd f20, 48(SP)
222-
lfd f21, 56(SP)
223-
lfd f22, 64(SP)
224-
lfd f23, 72(SP)
225-
lfd f24, 80(SP)
226-
lfd f25, 88(SP)
227-
lfd f26, 96(SP)
228-
lfd f27, 104(SP)
229-
lfd f28, 112(SP)
230-
lfd f29, 120(SP)
231-
lfd f30, 128(SP)
232-
lfd f31, 136(SP)
233-
234-
ld r31, 144(SP)
235-
ld r30, 152(SP)
236-
ld r29, 160(SP)
237-
ld r28, 168(SP)
238-
ld r27, 176(SP)
239-
ld r26, 184(SP)
240-
ld r25, 192(SP)
241-
ld r24, 200(SP)
242-
ld r23, 208(SP)
243-
ld r22, 216(SP)
244-
ld r21, 224(SP)
245-
ld r20, 232(SP)
246-
ld r19, 240(SP)
247-
ld r18, 248(SP)
248-
ld r17, 256(SP)
249-
ld r16, 264(SP)
250-
ld r15, 272(SP)
251-
ld r14, 280(SP)
252-
addi r11,SP,288
253-
lvx v20, r11,r3
254-
addi r11,r11,16
255-
lvx v21, r11,r3
256-
addi r11,r11,16
257-
lvx v22, r11,r3
258-
addi r11,r11,16
259-
lvx v23, r11,r3
260-
addi r11,r11,16
261-
lvx v24, r11,r3
262-
addi r11,r11,16
263-
lvx v25, r11,r3
264-
addi r11,r11,16
265-
lvx v26, r11,r3
266-
addi r11,r11,16
267-
lvx v27, r11,r3
268-
addi r11,r11,16
269-
lvx v28, r11,r3
270-
addi r11,r11,16
271-
lvx v29, r11,r3
272-
addi r11,r11,16
273-
lvx v30, r11,r3
274-
addi r11,r11,16
275-
lvx v31, r11,r3
276-
li r11,0
166+
ld r14, 0(SP)
167+
ld r15, 8(SP)
168+
ld r16, 16(SP)
169+
ld r17, 24(SP)
170+
ld r18, 32(SP)
171+
ld r19, 40(SP)
172+
ld r20, 48(SP)
173+
ld r21, 56(SP)
174+
ld r22, 64(SP)
175+
ld r23, 72(SP)
176+
ld r24, 80(SP)
177+
ld r25, 88(SP)
178+
ld r26, 96(SP)
179+
ld r27, 104(SP)
180+
ld r28, 112(SP)
181+
ld r29, 120(SP)
182+
ld r30, 128(SP)
183+
ld r31, 136(SP)
277184

278185
addi SP, SP, STACKSIZE
279186
//addi SP, SP, 208

0 commit comments

Comments
 (0)