Skip to content

Commit 3e3ccb9

Browse files
authored
Add ARM64 implementations of ?sum
as trivial copies of the respective ?asum kernels with the fabs calls removed
1 parent 94ab4e6 commit 3e3ccb9

File tree

3 files changed

+508
-0
lines changed

3 files changed

+508
-0
lines changed

kernel/arm64/csum.S

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/*******************************************************************************
2+
Copyright (c) 2019, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
#define	N	x0	/* element count; tested against zero before any load */
#define	X	x1	/* address of the next element to read                */
#define	INC_X	x2	/* stride in elements; INIT_S rescales it to bytes    */
#define	I	x5	/* scratch loop counter                               */
35+
36+
/*******************************************************************************
37+
* Macro definitions
38+
*******************************************************************************/
39+
40+
#define	REG0	wzr		/* integer zero used to clear SUMF            */
#define	SUMF	s0		/* running sum; holds the result at ret       */
#define	TMPF	s1		/* per-element scratch (lane 0 of v1)         */
#define	TMPVF	{v1.s}[0]	/* lane form of TMPF; not referenced below    */
#define	SZ	4		/* bytes per float; not referenced below      */
45+
46+
/******************************************************************************/
47+
48+
/*
 * KERNEL_F1 - consume one contiguous element (two packed 32-bit floats,
 * 8 bytes) and add both floats to SUMF.
 * Advances X by 8. Overwrites v1 (TMPF) and v2.
 */
.macro KERNEL_F1
	ld1	{v1.2s}, [X], #8		/* v1 = {f0, f1}; X += 8         */
	ext	v2.8b, v1.8b, v1.8b, #4		/* swap halves so that s2 = f1   */
	fadd	TMPF, TMPF, s2			/* TMPF = f0 + f1                */
	fadd	SUMF, SUMF, TMPF		/* fold the pair into the total  */
.endm
54+
55+
/*
 * KERNEL_F8 - consume 64 contiguous bytes (16 floats, i.e. eight 8-byte
 * elements) and accumulate them lane-wise into v0.4s.
 * Advances X by 64. Overwrites v1-v4. The four partial sums left in v0
 * are collapsed to a scalar by KERNEL_F8_FINALIZE.
 */
.macro KERNEL_F8
	/* Post-indexed load: same effect as a plain load followed by X += 64. */
	ld1	{v1.4s, v2.4s, v3.4s, v4.4s}, [X], #64

	/* Prefetch hint for data 1024 bytes past the updated pointer. */
	PRFM	PLDL1KEEP, [X, #1024]

	fadd	v1.4s, v1.4s, v2.4s		/* pairwise-combine the four ... */
	fadd	v3.4s, v3.4s, v4.4s		/* ... loaded vectors            */
	fadd	v0.4s, v0.4s, v1.4s		/* then add both halves to v0    */
	fadd	v0.4s, v0.4s, v3.4s
.endm
66+
67+
/*
 * KERNEL_F8_FINALIZE - reduce the four float lanes of v0 to one scalar
 * in SUMF. Overwrites v1.
 */
.macro KERNEL_F8_FINALIZE
	ext	v1.16b, v0.16b, v0.16b, #8	/* low half of v1 = lanes 2,3 of v0 */
	fadd	v0.2s, v0.2s, v1.2s		/* {l0 + l2, l1 + l3}               */
	faddp	SUMF, v0.2s			/* SUMF = (l0 + l2) + (l1 + l3)     */
.endm
72+
73+
/*
 * INIT_S - prepare the strided path: turn INC_X from an element stride
 * into a byte stride (elements here are 8 bytes, hence the shift by 3).
 */
.macro INIT_S
	lsl	INC_X, INC_X, #3		/* INC_X *= 8 */
.endm
76+
77+
/*
 * KERNEL_S1 - strided counterpart of KERNEL_F1: read one 8-byte element
 * (two floats), add both floats to SUMF, then step X by the byte stride
 * held in INC_X (INIT_S must have run first).
 * Overwrites v1 (TMPF) and v2.
 */
.macro KERNEL_S1
	ld1	{v1.2s}, [X], INC_X		/* v1 = {f0, f1}; X += INC_X     */
	ext	v2.8b, v1.8b, v1.8b, #4		/* swap halves so that s2 = f1   */
	fadd	TMPF, TMPF, s2			/* TMPF = f0 + f1                */
	fadd	SUMF, SUMF, TMPF		/* fold the pair into the total  */
.endm
84+
85+
/*******************************************************************************
86+
* End of macro definitions
87+
*******************************************************************************/
88+
89+
/*
 * Routine body (entry/exit boilerplate comes from the PROLOGUE/EPILOGUE
 * macros of common.h, which are not shown in this file).
 *
 * In:   N (x0) element count, X (x1) data pointer, INC_X (x2) stride in
 *       elements. Every element is two consecutive 32-bit floats.
 * Out:  SUMF (s0) = sum of both floats of every visited element;
 *       0.0 when N <= 0 or INC_X <= 0.
 * Uses: x5, v0-v4, condition flags. No stack accesses.
 *
 * INC_X == 1 : blocks of 8 elements (KERNEL_F8), then the N % 8 leftovers
 *              one at a time (KERNEL_F1).
 * otherwise  : 4 elements per iteration (KERNEL_S1 x 4), then the N % 4
 *              leftovers one at a time.
 */
	PROLOGUE

	/* Writing s0 also clears the upper lanes of v0, so the vector
	 * accumulator used by KERNEL_F8 starts out as all zeros. */
	fmov	SUMF, REG0
	fmov	TMPF, SUMF

	/* Degenerate arguments: return the zero that was just set up. */
	cmp	N, #0
	b.le	.Lcsum_kernel_L999
	cmp	INC_X, #0
	b.le	.Lcsum_kernel_L999

	/* Any stride other than 1 takes the generic strided path. */
	cmp	INC_X, #1
	b.ne	.Lcsum_kernel_S_BEGIN

.Lcsum_kernel_F_BEGIN:

	/* I = N / 8 full blocks; none means go straight to the leftovers.
	 * The flags are not needed here: both successors set them anew. */
	asr	I, N, #3
	cbz	I, .Lcsum_kernel_F1

.Lcsum_kernel_F8:

	KERNEL_F8

	subs	I, I, #1
	b.ne	.Lcsum_kernel_F8

	/* Fold the vector partial sums into the scalar SUMF. */
	KERNEL_F8_FINALIZE

.Lcsum_kernel_F1:

	/* I = N % 8 leftover elements. */
	ands	I, N, #7
	b.le	.Lcsum_kernel_L999

.Lcsum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	b.ne	.Lcsum_kernel_F10

.Lcsum_kernel_L999:
	ret

.Lcsum_kernel_S_BEGIN:

	/* Stride is converted from elements to bytes once, up front. */
	INIT_S

	/* I = N / 4 groups of four strided elements. */
	asr	I, N, #2
	cmp	I, #0
	b.le	.Lcsum_kernel_S1

.Lcsum_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	b.ne	.Lcsum_kernel_S4

.Lcsum_kernel_S1:

	/* I = N % 4 leftover elements. */
	ands	I, N, #3
	b.le	.Lcsum_kernel_L999

.Lcsum_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	b.ne	.Lcsum_kernel_S10

	ret

	EPILOGUE

kernel/arm64/sum.S

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
/*******************************************************************************
2+
Copyright (c) 2019, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
#define	N	x0	/* element count; tested against zero before any load */
#define	X	x1	/* address of the next element to read                */
#define	INC_X	x2	/* stride in elements; INIT_S rescales it to bytes    */
#define	I	x5	/* scratch loop counter                               */
35+
36+
/*******************************************************************************
37+
* Macro definitions
38+
*******************************************************************************/
39+
40+
/*
 * Precision-dependent aliases. DOUBLE selects 64-bit elements; without
 * it the kernel works on 32-bit floats.
 *   REG0  - integer zero register used to clear SUMF
 *   SUMF  - running sum, holds the result at ret
 *   TMPF  - per-element scratch (lane 0 of v1)
 *   TMPVF - the same scratch written as a vector lane, for lane loads
 *   SZ    - element size in bytes
 */
#if defined(DOUBLE)
#define	REG0	xzr
#define	SUMF	d0
#define	TMPF	d1
#define	TMPVF	{v1.d}[0]
#define	SZ	8
#else
#define	REG0	wzr
#define	SUMF	s0
#define	TMPF	s1
#define	TMPVF	{v1.s}[0]
#define	SZ	4
#endif
53+
54+
/******************************************************************************/
55+
56+
/*
 * KERNEL_F1 - add one contiguous element to SUMF.
 * Advances X by SZ bytes. Overwrites TMPF.
 */
.macro KERNEL_F1
	ldr	TMPF, [X], #SZ			/* TMPF = *X; X += SZ */
	fadd	SUMF, SUMF, TMPF		/* SUMF += TMPF       */
.endm
60+
61+
/*
 * KERNEL_F8 - consume eight contiguous elements and accumulate them
 * lane-wise into v0; KERNEL_F8_FINALIZE later reduces v0 to a scalar.
 *   DOUBLE : reads 64 bytes, overwrites v2-v5, accumulates into v0.2d
 *   float  : reads 32 bytes, overwrites v1-v2, accumulates into v0.4s
 * X is advanced past the data that was read.
 */
.macro KERNEL_F8
#if defined(DOUBLE)
	/* Post-indexed load: same effect as a plain load followed by X += 64. */
	ld1	{v2.2d, v3.2d, v4.2d, v5.2d}, [X], #64

	/* Prefetch hint for data 1024 bytes past the updated pointer. */
	PRFM	PLDL1KEEP, [X, #1024]

	fadd	v2.2d, v2.2d, v3.2d		/* {x0+x2, x1+x3}          */
	fadd	v4.2d, v4.2d, v5.2d		/* {x4+x6, x5+x7}          */
	fadd	v0.2d, v0.2d, v2.2d		/* add both halves to v0   */
	fadd	v0.2d, v0.2d, v4.2d
#else
	ld1	{v1.4s, v2.4s}, [X], #32	/* v1 = x0..x3, v2 = x4..x7; X += 32 */
	fadd	v1.4s, v1.4s, v2.4s		/* {x0+x4, x1+x5, x2+x6, x3+x7}      */
	fadd	v0.4s, v0.4s, v1.4s		/* accumulate the four partial sums  */
	/* Prefetch hint for data 1024 bytes past the updated pointer. */
	PRFM	PLDL1KEEP, [X, #1024]
#endif
.endm
79+
80+
/*
 * KERNEL_F8_FINALIZE - reduce the lane-wise partial sums in v0 to the
 * scalar SUMF. The float variant also overwrites v1.
 */
.macro KERNEL_F8_FINALIZE
#if defined(DOUBLE)
	faddp	SUMF, v0.2d			/* SUMF = l0 + l1                   */
#else
	ext	v1.16b, v0.16b, v0.16b, #8	/* low half of v1 = lanes 2,3 of v0 */
	fadd	v0.2s, v0.2s, v1.2s		/* {l0 + l2, l1 + l3}               */
	faddp	SUMF, v0.2s			/* SUMF = (l0 + l2) + (l1 + l3)     */
#endif
.endm
89+
90+
/*
 * INIT_S - prepare the strided path: turn INC_X from an element stride
 * into a byte stride (x8 for DOUBLE, x4 for float).
 */
.macro INIT_S
#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		/* INC_X *= 8 */
#else
	lsl	INC_X, INC_X, #2		/* INC_X *= 4 */
#endif
.endm
97+
98+
/*
 * KERNEL_S1 - strided counterpart of KERNEL_F1: load one element into
 * lane 0 of v1, add it to SUMF and step X by the byte stride held in
 * INC_X (INIT_S must have run first).
 */
.macro KERNEL_S1
	ld1	TMPVF, [X], INC_X		/* lane 0 of v1 = *X; X += INC_X */
	fadd	SUMF, SUMF, TMPF		/* SUMF += loaded element        */
.endm
102+
103+
/*******************************************************************************
104+
* End of macro definitions
105+
*******************************************************************************/
106+
107+
/*
 * Routine body (entry/exit boilerplate comes from the PROLOGUE/EPILOGUE
 * macros of common.h, which are not shown in this file).
 *
 * In:   N (x0) element count, X (x1) data pointer, INC_X (x2) stride in
 *       elements. Elements are 64-bit when DOUBLE is defined, else 32-bit.
 * Out:  SUMF (d0 or s0) = sum of all visited elements;
 *       0.0 when N <= 0 or INC_X <= 0.
 * Uses: x5, v0-v5, condition flags. No stack accesses.
 *
 * INC_X == 1 : blocks of 8 elements (KERNEL_F8), then the N % 8 leftovers
 *              one at a time (KERNEL_F1).
 * otherwise  : 4 elements per iteration (KERNEL_S1 x 4), then the N % 4
 *              leftovers one at a time.
 */
	PROLOGUE

	/* Writing the scalar SUMF also clears the upper lanes of v0, so the
	 * vector accumulator used by KERNEL_F8 starts out as all zeros.
	 * TMPF expands to s1 or d1 to match the selected precision. */
	fmov	SUMF, REG0
	fmov	TMPF, SUMF

	/* Degenerate arguments: return the zero that was just set up. */
	cmp	N, #0
	b.le	.Lsum_kernel_L999
	cmp	INC_X, #0
	b.le	.Lsum_kernel_L999

	/* Any stride other than 1 takes the generic strided path. */
	cmp	INC_X, #1
	b.ne	.Lsum_kernel_S_BEGIN

.Lsum_kernel_F_BEGIN:

	/* I = N / 8 full blocks; none means go straight to the leftovers.
	 * The flags are not needed here: both successors set them anew. */
	asr	I, N, #3
	cbz	I, .Lsum_kernel_F1

.Lsum_kernel_F8:

	KERNEL_F8

	subs	I, I, #1
	b.ne	.Lsum_kernel_F8

	/* Fold the vector partial sums into the scalar SUMF. */
	KERNEL_F8_FINALIZE

.Lsum_kernel_F1:

	/* I = N % 8 leftover elements. */
	ands	I, N, #7
	b.le	.Lsum_kernel_L999

.Lsum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	b.ne	.Lsum_kernel_F10

.Lsum_kernel_L999:
	ret

.Lsum_kernel_S_BEGIN:

	/* Stride is converted from elements to bytes once, up front. */
	INIT_S

	/* I = N / 4 groups of four strided elements. */
	asr	I, N, #2
	cmp	I, #0
	b.le	.Lsum_kernel_S1

.Lsum_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	b.ne	.Lsum_kernel_S4

.Lsum_kernel_S1:

	/* I = N % 4 leftover elements. */
	ands	I, N, #3
	b.le	.Lsum_kernel_L999

.Lsum_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	b.ne	.Lsum_kernel_S10

	ret

	EPILOGUE

0 commit comments

Comments
 (0)