Skip to content

Commit 546f135

Browse files
XiWeiGuyinshiyou
authored and committed
loongarch64: Add {c/z}swap and {c/z}sum optimization
1 parent edabb93 commit 546f135

File tree

6 files changed

+1367
-0
lines changed

6 files changed

+1367
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lsx.S
7575
CNRM2KERNEL = cnrm2_lsx.S
7676
ZNRM2KERNEL = znrm2_lsx.S
7777

78+
CSWAPKERNEL = cswap_lsx.S
79+
ZSWAPKERNEL = cswap_lsx.S
80+
81+
CSUMKERNEL = csum_lsx.S
82+
ZSUMKERNEL = csum_lsx.S
83+
7884
DGEMMKERNEL = dgemm_kernel_8x4.S
7985
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
8086
DGEMMITCOPY = dgemm_tcopy_8_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lasx.S
7575
CNRM2KERNEL = cnrm2_lasx.S
7676
ZNRM2KERNEL = znrm2_lasx.S
7777

78+
CSWAPKERNEL = cswap_lasx.S
79+
ZSWAPKERNEL = cswap_lasx.S
80+
81+
CSUMKERNEL = csum_lasx.S
82+
ZSUMKERNEL = csum_lasx.S
83+
7884
DGEMMKERNEL = dgemm_kernel_16x4.S
7985
DGEMMINCOPY = dgemm_ncopy_16.S
8086
DGEMMITCOPY = dgemm_tcopy_16.S

kernel/loongarch64/csum_lasx.S

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
/*******************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
28+
#define ASSEMBLER

#include "common.h"

/***********************************************************************
 * CSUM/ZSUM kernel (LoongArch64 LASX).
 *
 * Returns the plain sum of the real and imaginary parts of all N
 * complex elements of X (no absolute value -- this is SUM, not ASUM).
 *
 * In:   N    ($r4)  number of complex elements
 *       X    ($r5)  source vector
 *       INCX ($r6)  stride in complex elements
 * Out:  $f0          the sum (single or double, depending on DOUBLE)
 *
 * res1 is the vector accumulator; its element 0 aliases the scalar
 * accumulator s1 ($f16), so the scalar tail loops add straight into
 * the register the horizontal reduction finished in.
 **********************************************************************/

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r17
#define TEMP   $r18
#define t1     $r15
#define t2     $r12
#define t3     $r13
#define t4     $r14
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define s1     $f16
#define VX0    $xr12
#define VX1    $xr13
#define VX2    $xr14
#define VX3    $xr15
#define res1   $xr16
#define res2   $xr17

    PROLOGUE

    xvxor.v     res1, res1, res1
    xvxor.v     res2, res2, res2
    bge         $r0, N, .L999            /* N <= 0: return 0 */
    bge         $r0, INCX, .L999         /* INCX <= 0: return 0 */
    li.d        TEMP, 1
    slli.d      TEMP, TEMP, ZBASE_SHIFT  /* TEMP = sizeof(complex) in bytes */
    slli.d      INCX, INCX, ZBASE_SHIFT  /* stride in bytes */
    srai.d      I, N, 3                  /* I = N/8: 8 complex per vector iter */
    bne         INCX, TEMP, .L20         /* non-unit stride: gather path */
    bge         $r0, I, .L13             /* fewer than 8 elements: scalar tail */
    .align 3

/* Contiguous main loop: 16 reals (8 complex elements) per iteration. */
.L11:
#ifdef DOUBLE
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 4 * SIZE
    xvfadd.d    res2, VX0, VX1
    xvfadd.d    res1, res1, res2
    xvld        VX2, X, 8 * SIZE
    xvld        VX3, X, 12 * SIZE
    xvfadd.d    res2, VX2, VX3
    xvfadd.d    res1, res1, res2
#else
    xvld        VX0, X, 0 * SIZE
    xvld        VX1, X, 8 * SIZE
    xvfadd.s    res2, VX0, VX1
    xvfadd.s    res1, res2, res1
#endif
    addi.d      X, X, 16 * SIZE
    addi.d      I, I, -1
    blt         $r0, I, .L11
    .align 3

/* Horizontal reduction of res1 into its element 0 (= s1).
 * xvpickve.{w,d} copies the selected lane into lane 0 of the
 * destination and zeroes the remaining lanes, so each xvfadd below
 * only changes lane 0 of res1 and the later picks still read the
 * untouched upper lanes.
 * Fixes vs. the original: lanes 4-7 are taken from res1 (res2 is a
 * loop temporary already folded into res1, reading it double-counted
 * the last iteration), and the final add uses VX3 -- the original
 * added VX2 twice and dropped lane 7 entirely. */
.L12:
#ifdef DOUBLE
    xvpickve.d  VX1, res1, 1
    xvpickve.d  VX2, res1, 2
    xvpickve.d  VX3, res1, 3
    xvfadd.d    res1, VX1, res1
    xvfadd.d    res1, VX2, res1
    xvfadd.d    res1, VX3, res1
#else
    xvpickve.w  VX1, res1, 1
    xvpickve.w  VX2, res1, 2
    xvpickve.w  VX3, res1, 3
    xvfadd.s    res1, VX1, res1
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
    xvpickve.w  VX0, res1, 4
    xvpickve.w  VX1, res1, 5
    xvpickve.w  VX2, res1, 6
    xvpickve.w  VX3, res1, 7
    xvfadd.s    res1, VX0, res1
    xvfadd.s    res1, VX1, res1
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
#endif
    .align 3

.L13:
    andi        I, N, 7                  /* leftover complex elements */
    bge         $r0, I, .L999
    .align 3

/* Contiguous scalar tail: one complex element per iteration. */
.L14:
    LD          a1, X, 0 * SIZE
    LD          a2, X, 1 * SIZE
    ADD         a1, a1, a2
    ADD         s1, a1, s1
    addi.d      I, I, -1
    addi.d      X, X, 2 * SIZE
    blt         $r0, I, .L14
    b           .L999
    .align 3

/* Strided path. */
.L20:
    bge         $r0, I, .L23
    .align 3

/* Strided main loop: gather 8 complex elements with scalar loads,
 * insert them into VX0/VX1, then accumulate as in the unit-stride
 * loop. */
.L21:
#ifdef DOUBLE
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfadd.d    res2, VX0, VX1
    xvfadd.d    res1, res1, res2
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfadd.d    res2, VX0, VX1
    xvfadd.d    res1, res1, res2
#else
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w VX1, t1, 0
    xvinsgr2vr.w VX1, t2, 1
    xvinsgr2vr.w VX1, t3, 2
    xvinsgr2vr.w VX1, t4, 3
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w VX1, t1, 4
    xvinsgr2vr.w VX1, t2, 5
    xvinsgr2vr.w VX1, t3, 6
    xvinsgr2vr.w VX1, t4, 7
    xvfadd.s    res2, VX0, VX1
    xvfadd.s    res1, res2, res1
#endif
    addi.d      I, I, -1
    blt         $r0, I, .L21
    .align 3

/* Horizontal reduction -- same as .L12 (see comments there), with the
 * same two corrections (pick lanes 4-7 from res1; final add uses VX3). */
.L22:
#ifdef DOUBLE
    xvpickve.d  VX1, res1, 1
    xvpickve.d  VX2, res1, 2
    xvpickve.d  VX3, res1, 3
    xvfadd.d    res1, VX1, res1
    xvfadd.d    res1, VX2, res1
    xvfadd.d    res1, VX3, res1
#else
    xvpickve.w  VX1, res1, 1
    xvpickve.w  VX2, res1, 2
    xvpickve.w  VX3, res1, 3
    xvfadd.s    res1, VX1, res1
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
    xvpickve.w  VX0, res1, 4
    xvpickve.w  VX1, res1, 5
    xvpickve.w  VX2, res1, 6
    xvpickve.w  VX3, res1, 7
    xvfadd.s    res1, VX0, res1
    xvfadd.s    res1, VX1, res1
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
#endif
    .align 3

.L23:
    andi        I, N, 7                  /* leftover complex elements */
    bge         $r0, I, .L999
    .align 3

/* Strided scalar tail: one complex element per iteration. */
.L24:
    LD          a1, X, 0 * SIZE
    LD          a2, X, 1 * SIZE
    ADD         a1, a1, a2
    ADD         s1, a1, s1
    addi.d      I, I, -1
    add.d       X, X, INCX
    blt         $r0, I, .L24
    .align 3

.L999:
    /* MOV expands to fmov.d under DOUBLE and fmov.s otherwise
     * (common_loongarch64.h).  The original used fmov.s
     * unconditionally, which does not guarantee the upper 32 bits of
     * $f0 for the ZSUM (double) build. */
    MOV         $f0, s1
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)