Skip to content

Commit f8b82bc

Browse files
authored
Add ia64 implementation of ?sum
as trivial copy of asum with the fabs calls removed
1 parent 3e3ccb9 commit f8b82bc

File tree

2 files changed

+362
-0
lines changed

2 files changed

+362
-0
lines changed

kernel/ia64/KERNEL

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
6060
ZASUMKERNEL = asum.S
6161
XASUMKERNEL = asum.S
6262

63+
# Point the CSUM, ZSUM and XSUM kernel variables at the shared sum.S source.
CSUMKERNEL = sum.S
64+
ZSUMKERNEL = sum.S
65+
XSUMKERNEL = sum.S
66+
6367
CNRM2KERNEL = nrm2.S
6468
ZNRM2KERNEL = nrm2.S
6569
XNRM2KERNEL = nrm2.S

kernel/ia64/sum.S

Lines changed: 358 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,358 @@
1+
/*********************************************************************/
2+
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2019, The OpenBLAS project */
4+
/* All rights reserved. */
5+
/* */
6+
/* Redistribution and use in source and binary forms, with or */
7+
/* without modification, are permitted provided that the following */
8+
/* conditions are met: */
9+
/* */
10+
/* 1. Redistributions of source code must retain the above */
11+
/* copyright notice, this list of conditions and the following */
12+
/* disclaimer. */
13+
/* */
14+
/* 2. Redistributions in binary form must reproduce the above */
15+
/* copyright notice, this list of conditions and the following */
16+
/* disclaimer in the documentation and/or other materials */
17+
/* provided with the distribution. */
18+
/* */
19+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
20+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
21+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
22+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
23+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
24+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
25+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
26+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
27+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
28+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
29+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
30+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
31+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
32+
/* POSSIBILITY OF SUCH DAMAGE. */
33+
/* */
34+
/* The views and conclusions contained in the software and */
35+
/* documentation are those of the authors and should not be */
36+
/* interpreted as representing official policies, either expressed */
37+
/* or implied, of The University of Texas at Austin. */
38+
/*********************************************************************/
39+
40+
#define ASSEMBLER
41+
#include "common.h"
42+
43+
/* Prefetch lead distance. The prefetch pointer PRE1 is initialised to
   X + PREFETCH_SIZE * SIZE, so this is the lookahead counted in units of
   SIZE (SIZE comes from common.h; presumably the element size in bytes).
   A different value is chosen for XDOUBLE, DOUBLE and the default build. */
#ifdef XDOUBLE
44+
#define PREFETCH_SIZE ( 8 * 16 + 4)
45+
#elif defined(DOUBLE)
46+
#define PREFETCH_SIZE (16 * 16 + 8)
47+
#else
48+
#define PREFETCH_SIZE (32 * 16 + 16)
49+
#endif
50+
51+
/* COMPADD is 1 for COMPLEX builds and 0 otherwise. It is subtracted from
   the shift counts / bit positions applied to N and added to the shift that
   scales INCX. STRIDE is the post-increment used by every other load:
   INCX itself for non-complex data, SIZE for complex data (the step between
   the two loads that make up one complex element). */
#ifndef COMPLEX
52+
#define COMPADD 0
53+
#define STRIDE INCX
54+
#else
55+
#define COMPADD 1
56+
#define STRIDE SIZE
57+
#endif
58+
59+
/* Prefetch pointer, advanced by INCX16 on each lfetch. */
#define PRE1 r2
60+
61+
/* I: unrolled-loop trip count (N >> (4 - COMPADD)), later decremented and
      copied to ar.lc.
   J: leftover element count (N masked with (1 << (4 - COMPADD)) - 1).
   INCX16: byte distance the prefetch pointer moves per lfetch. */
#define I r17
62+
#define J r18
63+
#define INCX16 r21
64+
65+
/* Scratch copies of the predicate registers and of ar.lc, restored before
   the function returns. */
#define PR r30
66+
#define ARLC r31
67+
68+
/* Incoming arguments. With F_INTERFACE defined, N and INCX arrive as
   addresses and are loaded through before use. */
#define N r32
69+
#define X r33
70+
#define INCX r34
71+
72+
73+
/* Entry and setup for the ?sum kernel: adds up the elements of X (N
   elements, stride INCX) without taking absolute values. The result is
   built in f8. Four partial sums (f8..f11) are cleared here, the stride is
   converted to bytes, and the rotating predicates / epilogue count are
   primed for the software-pipelined loop at .L52. If N <= 0 or INCX <= 0
   the routine returns immediately with f8 still zero.
   PROLOGUE, PROFCODE, LDINT, SIZE and BASE_SHIFT are macros from common.h. */
PROLOGUE
74+
.prologue
75+
PROFCODE
76+
{ .mfi
77+
/* PRE1 = X + PREFETCH_SIZE * SIZE: address the first lfetch will touch. */
adds PRE1 = PREFETCH_SIZE * SIZE, X
78+
/* Clear accumulator 0 (f0 is the IA-64 constant 0.0 register). */
mov f8 = f0
79+
.save ar.lc, ARLC
80+
/* Keep the caller's loop counter; it is written back at .L998. */
mov ARLC = ar.lc
81+
}
82+
;;
83+
.body
84+
#ifdef F_INTERFACE
85+
{ .mmi
86+
/* N and INCX hold addresses here; replace them with the values they
   point to. */
LDINT N = [N]
87+
LDINT INCX = [INCX]
88+
nop.i 0
89+
}
90+
;;
91+
#ifndef USE64BITINT
92+
{ .mii
93+
nop.m 0
94+
/* Sign-extend the 32-bit integers to the full register width. */
sxt4 N = N
95+
sxt4 INCX = INCX
96+
}
97+
;;
98+
#endif
99+
#endif
100+
{ .mmi
101+
/* p6 = !(0 < INCX), p7 = !(0 < N): either one triggers the early return
   in the next bundle. */
cmp.lt p0, p6 = r0, INCX
102+
cmp.lt p0, p7 = r0, N
103+
/* I = number of full unrolled passes (16 loads each). */
shr I = N, (4 - COMPADD)
104+
}
105+
{ .mbb
106+
/* J = elements left over after the full passes. */
and J = ((1 << (4 - COMPADD)) - 1), N
107+
(p6) br.ret.sptk.many b0
108+
(p7) br.ret.sptk.many b0
109+
}
110+
;;
111+
{ .mfi
112+
/* ar.lc takes (trip count - 1); a negative result means "no full pass". */
adds I = -1, I
113+
mov f10 = f0
114+
/* Save the predicate registers; restored (under a mask) at .L998. */
mov PR = pr
115+
}
116+
{ .mfi
117+
/* p9 = (J == 0): no remainder to process after the loop. */
cmp.eq p9, p0 = r0, J
118+
mov f9 = f0
119+
/* p12 = bit (3 - COMPADD) of N is set: the remainder contains a group of
   eight loads. */
tbit.z p0, p12 = N, 3 - COMPADD
120+
}
121+
;;
122+
{ .mmi
123+
/* Prime the pipeline stage predicates: p16 = 1, p17 = p18 = p19 = 0. */
cmp.eq p16, p0 = r0, r0
124+
cmp.ne p17, p0 = r0, r0
125+
/* Epilogue count for br.ctop: three stages (p16..p18) are drained inside
   the loop. */
mov ar.ec= 3
126+
}
127+
{ .mfi
128+
cmp.ne p18, p0 = r0, r0
129+
mov f11 = f0
130+
/* Scale INCX by 2^(BASE_SHIFT + COMPADD); presumably this converts an
   element stride to a byte stride (BASE_SHIFT is defined in common.h). */
shl INCX = INCX, BASE_SHIFT + COMPADD
131+
}
132+
;;
133+
{ .mmi
134+
/* INCX16 = scaled INCX << k: how far PRE1 moves per lfetch. XDOUBLE uses
   a shift one smaller because its loop body issues two lfetches. */
#ifdef XDOUBLE
135+
shladd INCX16 = INCX, (3 - COMPADD), r0
136+
#else
137+
shladd INCX16 = INCX, (4 - COMPADD), r0
138+
#endif
139+
cmp.ne p19, p0 = r0, r0
140+
mov ar.lc = I
141+
}
142+
{ .mmb
143+
/* p8 = (I < 0): fewer elements than one full pass, go straight to the
   remainder code. */
cmp.gt p8 ,p0 = r0, I
144+
#ifdef COMPLEX
145+
/* Complex loads advance X by SIZE (STRIDE) and then by INCX; removing
   SIZE from INCX makes each pair advance by the full scaled stride. */
adds INCX = - SIZE, INCX
146+
#else
147+
nop.m 0
148+
#endif
149+
(p8) br.cond.dpnt .L55
150+
}
151+
;;
152+
/* Main loop, software pipelined with rotating registers and br.ctop.
   Each pass issues 16 loads under p16 (post-increments alternate between
   STRIDE and INCX) and one prefetch that advances PRE1 by INCX16 (two
   prefetches when XDOUBLE is defined). The loaded values are added
   round-robin into the four accumulators f8, f9, f10, f11.
   Because the FP registers rotate by one name per br.ctop, a value loaded
   into fN under p16 is read as fN+2 by the p18 adds (e.g. f32 -> f34) and
   as fN+3 by the p19 adds (e.g. f68 -> f71). LDFD and FADD are macros from
   common.h. */
.align 32
153+
154+
.L52:
155+
{ .mmf
156+
(p16) lfetch.nt1 [PRE1], INCX16
157+
(p16) LDFD f32 = [X], STRIDE
158+
}
159+
{ .mfb
160+
/* p19 stage: add the last four values (loaded as f68, f71, f74, f77)
   of the pass issued three rotations earlier. */
(p19) FADD f8 = f8, f71
161+
}
162+
;;
163+
{ .mmf
164+
(p16) LDFD f35 = [X], INCX
165+
}
166+
{ .mfb
167+
(p19) FADD f9 = f9, f74
168+
}
169+
;;
170+
{ .mmf
171+
(p16) LDFD f38 = [X], STRIDE
172+
}
173+
{ .mfb
174+
(p19) FADD f10 = f10, f77
175+
}
176+
;;
177+
{ .mmf
178+
(p16) LDFD f41 = [X], INCX
179+
}
180+
{ .mfb
181+
(p19) FADD f11 = f11, f80
182+
}
183+
;;
184+
{ .mmf
185+
(p16) LDFD f44 = [X], STRIDE
186+
}
187+
{ .mfb
188+
/* p18 stage: add the first twelve values (loaded as f32 .. f65) of the
   pass issued two rotations earlier. */
(p18) FADD f8 = f8, f34
189+
}
190+
;;
191+
{ .mmf
192+
(p16) LDFD f47 = [X], INCX
193+
}
194+
{ .mfb
195+
(p18) FADD f9 = f9, f37
196+
}
197+
;;
198+
{ .mmf
199+
(p16) LDFD f50 = [X], STRIDE
200+
}
201+
{ .mfb
202+
(p18) FADD f10 = f10, f40
203+
}
204+
;;
205+
{ .mmf
206+
(p16) LDFD f53 = [X], INCX
207+
}
208+
{ .mfb
209+
(p18) FADD f11 = f11, f43
210+
}
211+
;;
212+
{ .mmf
213+
/* XDOUBLE only: second prefetch of the pass, halfway through the loads. */
#ifdef XDOUBLE
214+
(p16) lfetch.nt1 [PRE1], INCX16
215+
#endif
216+
(p16) LDFD f56 = [X], STRIDE
217+
}
218+
{ .mfb
219+
(p18) FADD f8 = f8, f46
220+
}
221+
;;
222+
{ .mmf
223+
(p16) LDFD f59 = [X], INCX
224+
}
225+
{ .mfb
226+
(p18) FADD f9 = f9, f49
227+
}
228+
;;
229+
{ .mmf
230+
(p16) LDFD f62 = [X], STRIDE
231+
}
232+
{ .mfb
233+
(p18) FADD f10 = f10, f52
234+
}
235+
;;
236+
{ .mmf
237+
(p16) LDFD f65 = [X], INCX
238+
}
239+
{ .mfb
240+
(p18) FADD f11 = f11, f55
241+
}
242+
;;
243+
{ .mmf
244+
(p16) LDFD f68 = [X], STRIDE
245+
}
246+
{ .mfb
247+
(p18) FADD f8 = f8, f58
248+
}
249+
;;
250+
{ .mmf
251+
(p16) LDFD f71 = [X], INCX
252+
}
253+
{ .mfb
254+
(p18) FADD f9 = f9, f61
255+
}
256+
;;
257+
{ .mmf
258+
(p16) LDFD f74 = [X], STRIDE
259+
}
260+
{ .mfb
261+
(p18) FADD f10 = f10, f64
262+
}
263+
;;
264+
{ .mmf
265+
(p16) LDFD f77 = [X], INCX
266+
}
267+
{ .mfb
268+
(p18) FADD f11 = f11, f67
269+
/* Counted loop branch: rotates the registers and stage predicates, and
   keeps looping while ar.lc / ar.ec have iterations left. */
br.ctop.sptk.few .L52
270+
}
271+
;;
272+
/* Drain: ar.ec was set to 3, so the loop exits after the final p18 stage
   and the p19 adds never run for the last pass. Its four remaining values
   (now named f71, f74, f77, f80 after the last rotation) are added here
   unconditionally. */
FADD f8 = f8, f71
273+
FADD f9 = f9, f74
274+
FADD f10 = f10, f77
275+
FADD f11 = f11, f80
276+
.align 32
277+
;;
278+
/* Remainder: processes the J elements not covered by the unrolled loop
   (reached both after the loop and directly from setup when there was no
   full pass). Bits of N select fixed-size groups of loads:
     p12 (bit 3 - COMPADD) -> 8 loads into f32..f39
     p13 (bit 2 - COMPADD) -> 4 loads into f40..f43
     p14 (bit 1 - COMPADD) -> 2 loads into f44..f45
     p15 (bit 0, non-COMPLEX builds only) -> 1 load into f46
   Loads and adds are interleaved; the values go round-robin into the
   accumulators f8..f11. */
.L55:
279+
(p12) LDFD f32 = [X], STRIDE
280+
/* p9 = (J == 0): nothing left over, jump to the final reduction. */
(p9) br.cond.dptk .L998
281+
;;
282+
(p12) LDFD f33 = [X], INCX
283+
;;
284+
(p12) LDFD f34 = [X], STRIDE
285+
;;
286+
(p12) LDFD f35 = [X], INCX
287+
/* p13 = bit (2 - COMPADD) of N is set. */
tbit.z p0, p13 = N, (2 - COMPADD)
288+
;;
289+
(p12) LDFD f36 = [X], STRIDE
290+
/* p14 = bit (1 - COMPADD) of N is set. */
tbit.z p0, p14 = N, (1 - COMPADD)
291+
;;
292+
(p12) LDFD f37 = [X], INCX
293+
#ifndef COMPLEX
294+
/* p15 = N is odd (single trailing element). */
tbit.z p0, p15 = N, 0
295+
#endif
296+
;;
297+
(p12) LDFD f38 = [X], STRIDE
298+
;;
299+
(p12) LDFD f39 = [X], INCX
300+
;;
301+
(p13) LDFD f40 = [X], STRIDE
302+
;;
303+
(p13) LDFD f41 = [X], INCX
304+
;;
305+
(p13) LDFD f42 = [X], STRIDE
306+
/* From here on the adds for the p12 group overlap the remaining loads. */
(p12) FADD f8 = f8, f32
307+
;;
308+
(p13) LDFD f43 = [X], INCX
309+
(p12) FADD f9 = f9, f33
310+
;;
311+
(p14) LDFD f44 = [X], STRIDE
312+
(p12) FADD f10 = f10, f34
313+
;;
314+
(p14) LDFD f45 = [X], INCX
315+
(p12) FADD f11 = f11, f35
316+
;;
317+
#ifndef COMPLEX
318+
/* Last load: no post-increment, X is not used afterwards. */
(p15) LDFD f46 = [X]
319+
#endif
320+
(p12) FADD f8 = f8, f36
321+
;;
322+
(p12) FADD f9 = f9, f37
323+
(p12) FADD f10 = f10, f38
324+
(p12) FADD f11 = f11, f39
325+
;;
326+
(p13) FADD f8 = f8, f40
327+
(p13) FADD f9 = f9, f41
328+
/* The following conditional is empty in this file (no code inside). */
#ifndef COMPLEX
329+
#endif
330+
(p13) FADD f10 = f10, f42
331+
;;
332+
(p13) FADD f11 = f11, f43
333+
(p14) FADD f8 = f8, f44
334+
(p14) FADD f9 = f9, f45
335+
#ifndef COMPLEX
336+
(p15) FADD f10 = f10, f46
337+
#endif
338+
;;
339+
/* Final reduction and return: folds the four partial sums into f8,
   restores ar.lc and the predicate registers from their saved copies, and
   returns through b0 with the total in f8. */
.align 32
340+
341+
.L998:
342+
{ .mfi
343+
/* Pairwise fold: f8 += f9 here, f10 += f11 in the next bundle. */
FADD f8 = f8, f9
344+
/* Restore the caller's loop counter saved on entry. */
mov ar.lc = ARLC
345+
}
346+
{ .mmf
347+
FADD f10 = f10, f11
348+
}
349+
;;
350+
{ .mii
351+
/* Restore predicates from PR under mask -65474 (low bits 0x...FFFF003E).
   NOTE(review): by the IA-64 `mov pr` mask encoding this should select
   p1-p5 and the rotating predicates p16-p63, leaving p6-p15 as they are;
   confirm against the instruction reference. */
mov pr = PR, -65474
352+
}
353+
;;
354+
{ .mfb
355+
/* f8 = (f8 + f9) + (f10 + f11): the complete sum. */
FADD f8 = f8, f10
356+
br.ret.sptk.many b0
357+
}
358+
EPILOGUE

0 commit comments

Comments
 (0)