Commit 06fd5b5

CNClareChenyinshiyou authored and committed
loongarch64: Add and Refine asum optimization functions.
1 parent: e771be1 · commit: 06fd5b5

File tree

10 files changed: +1210, -615 lines


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 4 additions & 2 deletions
@@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lsx.S
 SSUMKERNEL = sum_lsx.S
 DSUMKERNEL = sum_lsx.S
 
-SASUMKERNEL = sasum_lsx.S
-DASUMKERNEL = dasum_lsx.S
+SASUMKERNEL = asum_lsx.S
+DASUMKERNEL = asum_lsx.S
+CASUMKERNEL = casum_lsx.S
+ZASUMKERNEL = casum_lsx.S
 
 SROTKERNEL = rot_lsx.S
 DROTKERNEL = rot_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 4 additions & 2 deletions
@@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lasx.S
 SSUMKERNEL = sum_lasx.S
 DSUMKERNEL = sum_lasx.S
 
-SASUMKERNEL = sasum_lasx.S
-DASUMKERNEL = dasum_lasx.S
+SASUMKERNEL = asum_lasx.S
+DASUMKERNEL = asum_lasx.S
+CASUMKERNEL = casum_lasx.S
+ZASUMKERNEL = casum_lasx.S
 
 SROTKERNEL = rot_lasx.S
 DROTKERNEL = rot_lasx.S
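
The KERNEL entries above route SASUM/DASUM for both the LSX (LOONGSON2K1000) and LASX (LOONGSON3R5) targets to the new unified asum sources, and add CASUM/ZASUM mappings to the new casum sources. For orientation only, here is a minimal scalar C sketch of the semantics these kernels implement: ?ASUM returns the sum of absolute values of n elements read with stride inc_x, and the complex variants sum |re| + |im| per element. The function and parameter names below are illustrative, not part of this commit.

#include <math.h>

/* Hypothetical scalar reference for SASUM/DASUM semantics:
   sum of |x[i]| over n elements read with stride inc_x. */
static double asum_ref(long n, const double *x, long inc_x)
{
    double sum = 0.0;
    if (n <= 0 || inc_x <= 0) return 0.0;  /* the kernel also bails out early here */
    for (long i = 0; i < n; i++)
        sum += fabs(x[i * inc_x]);
    return sum;
}

/* Hypothetical reference for CASUM/ZASUM semantics:
   BLAS sums |re| + |im| per complex element, not the modulus. */
static double casum_ref(long n, const double *x, long inc_x)
{
    double sum = 0.0;
    if (n <= 0 || inc_x <= 0) return 0.0;
    for (long i = 0; i < n; i++)
        sum += fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
    return sum;
}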

kernel/loongarch64/asum_lasx.S

Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r17
#define TEMP  $r18
#define t1    $r15
#define t2    $r12
#define t3    $r13
#define t4    $r14
#define VX0   $xr12
#define VX1   $xr13
#define VX2   $xr14
#define VX3   $xr15
#define VT0   $xr23
#define VT1   $xr22
#define res1  $xr16
#define res2  $xr17
#define res0  $xr18
#define neg1  $xr19

    PROLOGUE
    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    xvxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
#ifdef DOUBLE
    li.d t1, -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l neg1, neg1
#else
    li.w t1, -1
    xvreplgr2vr.w neg1, t1
    xvffint.s.w neg1, neg1
#endif
    li.d TEMP, SIZE
    slli.d INCX, INCX, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
#else
    xvld VX0, X, 0 * SIZE
    xvfmul.s VX2, neg1, VX0
    xvfcmp.clt.s VT0, VX0, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvfadd.s res1, VX0, res1
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    xvfadd.s res2, res1, res2
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res2, 4
    xvpickve.w VX1, res2, 5
    xvpickve.w VX2, res2, 6
    xvpickve.w VX3, res2, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD $f12, X, 0 * SIZE
    FABS $f12, $f12
    ADD $f16, $f12, $f16
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvfmul.s VX2, neg1, VX0
    xvfcmp.clt.s VT0, VX0, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvfadd.s res1, VX0, res1
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    xvfadd.s res2, res1, res2
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res2, 4
    xvpickve.w VX1, res2, 5
    xvpickve.w VX2, res2, 6
    xvpickve.w VX3, res2, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
233+
xvfadd.s res1, VX2, res1
234+
xvfadd.s res1, VX2, res1
235+
#endif
236+
.align 3
237+
238+
.L23:
239+
andi I, N, 7
240+
bge $r0, I, .L999
241+
.align 3
242+
243+
.L24:
244+
LD $f12, X, 0 * SIZE
245+
FABS $f12, $f12
246+
ADD $f16, $f12, $f16
247+
addi.d I, I, -1
248+
add.d X, X, INCX
249+
blt $r0, I, .L24
250+
.align 3
251+
252+
.L999:
253+
MOV $f0, $f16
254+
jirl $r0, $r1, 0x0
255+
.align 3
256+
257+
EPILOGUE
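
The vector loops in this file take |x| without a dedicated absolute-value instruction: each lane is multiplied by -1.0 (the splatted neg1 register), the original lane is compared against zero with xvfcmp.clt, and xvbitsel.v uses the resulting mask to keep either the original or the negated lane before accumulating into res1/res2. The lanes are then reduced with xvpickve/xvfadd, the scalar tail loops (.L14/.L24) handle the remaining n mod 8 elements with FABS/ADD, and .L999 returns the low lane of res1 in $f0. Below is a rough per-lane C emulation of the compare-and-select pattern, shown only to illustrate the technique; it is plain scalar code, not LASX intrinsics and not part of this commit.

#include <stdint.h>
#include <string.h>

/* Emulates one lane of the xvfcmp.clt + xvbitsel.v pair used above:
   mask = (x < 0.0) ? all-ones : all-zeros;  result = mask ? (-1.0 * x) : x. */
static double abs_via_select(double x)
{
    double neg = -1.0 * x;                  /* xvfmul.d   VX2, neg1, VX0      */
    uint64_t mask = (x < 0.0) ? ~0ULL : 0;  /* xvfcmp.clt.d VT0, VX0, res0    */

    uint64_t xb, nb, out;
    memcpy(&xb, &x, sizeof xb);
    memcpy(&nb, &neg, sizeof nb);
    out = (xb & ~mask) | (nb & mask);       /* xvbitsel.v VX0, VX0, VX2, VT0  */

    double r;
    memcpy(&r, &out, sizeof r);
    return r;
}

The mask-select form keeps the inner loop branch-free, which is what lets the absolute value and the accumulation proceed across all 256-bit LASX lanes at once.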
