Skip to content

Commit 8785e94

Browse files
CNClareChen authored and yinshiyou committed
loongarch64: Add camin optimization function.
1 parent 0753848 commit 8785e94

File tree

4 files changed

+412
-0
lines changed

4 files changed

+412
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lsx.S
1313

1414
SAMINKERNEL = amin_lsx.S
1515
DAMINKERNEL = amin_lsx.S
16+
CAMINKERNEL = camin_lsx.S
1617

1718
SMAXKERNEL = max_lsx.S
1819
DMAXKERNEL = max_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lasx.S
1313

1414
SAMINKERNEL = amin_lasx.S
1515
DAMINKERNEL = amin_lasx.S
16+
CAMINKERNEL = camin_lasx.S
1617

1718
SMAXKERNEL = max_lsx.S
1819
DMAXKERNEL = max_lsx.S

kernel/loongarch64/camin_lasx.S

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
#include "common.h"
30+
31+
#define N $r4 /* count operand: tested against zero, N >> 3 and N & 7 give the loop trip counts */
32+
#define X $r5 /* base address for every load; advanced as elements are consumed */
33+
#define INCX $r6 /* stride operand: tested against zero, then shifted left by ZBASE_SHIFT and added to X */
34+
#define I $r12 /* loop counter; also briefly holds -1 while neg1 is being built */
35+
#define TEMP $r16 /* holds 1 << ZBASE_SHIFT, compared with the scaled INCX to pick the contiguous path */
36+
#define t1 $f14 /* t1..t4: scalar values loaded in the strided loop (.L21) */
37+
#define t2 $f18 /* NOTE: same register number as res0 ($xr18) */
38+
#define t3 $f15
39+
#define t4 $f17
40+
#define s1 $f22 /* scalar running result, copied to $f0 on return; same register number as VM0 ($xr22) */
41+
#define s2 $f9 /* s2..s4: scalar partial-minimum registers of the strided path; same numbers as x1..x3 */
42+
#define s3 $f10
43+
#define s4 $f11
44+
#define a0 $f20 /* a0/a1: scalar temporaries for the seed value and the tail loop (.L24) */
45+
#define a1 $f21 /* NOTE: a0/a1 share register numbers with VX0/VX1 */
46+
#define x1 $xr9 /* x1..x4: vector temporaries (split even/odd words, negated copies, reduction) */
47+
#define x2 $xr10
48+
#define x3 $xr11
49+
#define x4 $xr12
50+
#define VT0 $xr13 /* VT0/VT1: results of the "less than res0" compares, used as bit-select masks */
51+
#define VT1 $xr14
52+
#define res0 $xr18 /* cleared with xvxor.v in the prologue; compared against to detect negative lanes */
53+
#define neg1 $xr19 /* integer -1 replicated per word, then converted to float */
54+
#define VX0 $xr20 /* VX0/VX1: the two vectors loaded per iteration of the contiguous loop (.L10) */
55+
#define VX1 $xr21
56+
#define VM0 $xr22 /* vector running minimum; the code writes s1 and then broadcasts it with xvreplve0.w VM0, VM0 */
57+
#define VM1 $xr23 /* per-iteration sum vector, also a reduction temporary */
58+
59+
/*
 * camin kernel, LASX version.
 *
 * Computes the minimum over all elements of |re| + |im|, where each element
 * is a pair of consecutive single-precision values starting at X.
 *
 * In:   N     count of elements
 *       X     address of the first element
 *       INCX  stride between elements (scaled by ZBASE_SHIFT below)
 * Out:  $f0   the minimum; the value moved from $r0 into s1 by MTC
 *             (presumably 0.0) when N <= 0 or INCX <= 0
 *
 * s1 ($f22) and VM0 ($xr22) share a register number; the code relies on s1
 * being lane 0 of VM0 so that the vector and scalar paths share one
 * accumulator.
 *
 * Fixes relative to the previous revision:
 *   - the horizontal reduction after the contiguous loop now folds all eight
 *     lanes of VM0 (only lanes 0..3 were considered before);
 *   - the strided loop keeps a running minimum in s1 across iterations
 *     instead of overwriting it, and no longer reads the never-written s2.
 */
    PROLOGUE
    MTC     s1, $r0                     // result for the early exits below
    xvxor.v res0, res0, res0            // res0 = all zero bits
    bge     $r0, N, .L999               // N <= 0: nothing to scan
    bge     $r0, INCX, .L999            // INCX <= 0: rejected
    fld.s   a0, X, 0 * SIZE
    fld.s   a1, X, 1 * SIZE
    fabs.s  a0, a0
    fabs.s  a1, a1
    fadd.s  s1, a1, a0                  // seed: s1 = |re| + |im| of element 0
    xvreplve0.w VM0, VM0                // broadcast the seed to every lane of VM0
    li.d    TEMP, 1
    li.w    I, -1
    slli.d  TEMP, TEMP, ZBASE_SHIFT     // TEMP = scaled value of a unit stride
    slli.d  INCX, INCX, ZBASE_SHIFT     // INCX = scaled stride added to X
    xvreplgr2vr.w neg1, I
    xvffint.s.w neg1, neg1              // neg1 = float(-1) in every lane
    srai.d  I, N, 3                     // I = number of 8-element blocks
    bne     INCX, TEMP, .L20            // non-unit stride: scalar path
    bge     $r0, I, .L23                // fewer than 8 elements: tail only
    .align 3

.L10: // contiguous: 8 elements per iteration
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 8 * SIZE
    addi.d  I, I, -1
    xvpickev.w x1, VX1, VX0             // even words (first value of each pair)
    xvpickod.w x2, VX1, VX0             // odd words (second value of each pair)
    xvfmul.s x3, neg1, x1               // negated copies
    xvfmul.s x4, neg1, x2
    xvfcmp.clt.s VT0, x1, res0          // lanes that are < 0
    xvfcmp.clt.s VT1, x2, res0
    xvbitsel.v x1, x1, x3, VT0          // take the negated copy where negative
    xvbitsel.v x2, x2, x4, VT1
    addi.d  X, X, 16 * SIZE
    xvfadd.s VM1, x1, x2                // |re| + |im| per lane
    xvfmin.s VM0, VM0, VM1              // lane-wise running minimum
    blt     $r0, I, .L10
    .align 3

.L11: // fold all eight lanes of VM0 into lane 0 (which is s1)
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    xvfmin.s x1, x1, x2
    xvfmin.s x3, x3, x4
    xvfmin.s VM1, x1, x3                // min of lanes 0..3
    xvpickve.w x1, VM0, 4
    xvpickve.w x2, VM0, 5
    xvpickve.w x3, VM0, 6
    xvpickve.w x4, VM0, 7
    xvfmin.s x1, x1, x2
    xvfmin.s x3, x3, x4
    xvfmin.s x1, x1, x3                 // min of lanes 4..7
    xvfmin.s VM0, VM1, x1               // lane 0 = min of all eight lanes
    b       .L23
    .align 3

.L20: // INCX!=1
    bge     $r0, I, .L23
    .align 3

.L21: // strided: 8 elements per iteration, scalar
    fld.s   t1, X, 0 * SIZE
    fld.s   t2, X, 1 * SIZE
    add.d   X, X, INCX
    fld.s   t3, X, 0 * SIZE
    fld.s   t4, X, 1 * SIZE
    add.d   X, X, INCX
    fabs.s  t1, t1
    fabs.s  t2, t2
    fabs.s  t3, t3
    fabs.s  t4, t4
    fadd.s  t1, t1, t2
    fadd.s  t3, t3, t4
    fmin.s  s2, t1, t3                  // partial min, elements 0-1
    fld.s   t1, X, 0 * SIZE
    fld.s   t2, X, 1 * SIZE
    add.d   X, X, INCX
    fld.s   t3, X, 0 * SIZE
    fld.s   t4, X, 1 * SIZE
    add.d   X, X, INCX
    fabs.s  t1, t1
    fabs.s  t2, t2
    fabs.s  t3, t3
    fabs.s  t4, t4
    fadd.s  t1, t1, t2
    fadd.s  t3, t3, t4
    fmin.s  s3, t1, t3                  // partial min, elements 2-3
    fld.s   t1, X, 0 * SIZE
    fld.s   t2, X, 1 * SIZE
    add.d   X, X, INCX
    fld.s   t3, X, 0 * SIZE
    fld.s   t4, X, 1 * SIZE
    add.d   X, X, INCX
    fabs.s  t1, t1
    fabs.s  t2, t2
    fabs.s  t3, t3
    fabs.s  t4, t4
    addi.d  I, I, -1
    fadd.s  t1, t1, t2
    fadd.s  t3, t3, t4
    fmin.s  s4, t1, t3                  // partial min, elements 4-5
    fld.s   t1, X, 0 * SIZE
    fld.s   t2, X, 1 * SIZE
    add.d   X, X, INCX
    fld.s   t3, X, 0 * SIZE
    fld.s   t4, X, 1 * SIZE
    add.d   X, X, INCX
    fabs.s  t1, t1
    fabs.s  t2, t2
    fabs.s  t3, t3
    fabs.s  t4, t4
    fadd.s  t1, t1, t2
    fadd.s  t3, t3, t4
    fmin.s  t1, t1, t3                  // partial min, elements 6-7
    fmin.s  s2, s2, s3
    fmin.s  s4, s4, t1
    fmin.s  s2, s2, s4                  // min of this block of eight
    fmin.s  s1, s1, s2                  // fold into the running minimum
    blt     $r0, I, .L21
    .align 3

.L23: //N<8
    andi    I, N, 7                     // remaining elements after the 8-wide blocks
    bge     $r0, I, .L999
    .align 3

.L24: // tail: one element per iteration
    LD      a0, X, 0 * SIZE
    LD      a1, X, 1 * SIZE
    addi.d  I, I, -1
    FABS    a0, a0
    FABS    a1, a1
    ADD     a0, a0, a1
    add.d   X, X, INCX
    fmin.s  s1, a0, s1
    blt     $r0, I, .L24
    .align 3

.L999:
    fmov.s  $f0, s1                     // return value
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)