Skip to content

Commit 173a65d

Browse files
CNClareChen authored and yinshiyou committed
loongarch64: Add and refine iamax optimization functions.
1 parent ea70e16 commit 173a65d

File tree

12 files changed

+2101
-866
lines changed

12 files changed

+2101
-866
lines changed

common_loongarch64.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ static inline int WhereAmI(void){
139139
#define XVFMAX xvfmax.d
140140
#define XVFMAXA xvfmaxa.d
141141
#define XVCMPEQ xvfcmp.ceq.d
142+
#define XVCMPLE xvfcmp.cle.d
142143
#define XVCMPLT xvfcmp.clt.d
143144
#define XVMUL xvfmul.d
144145
#define XVMSUB xvfmsub.d
@@ -151,6 +152,7 @@ static inline int WhereAmI(void){
151152
#define VFMAX vfmax.d
152153
#define VFMAXA vfmaxa.d
153154
#define VCMPEQ vfcmp.ceq.d
155+
#define VCMPLE vfcmp.cle.d
154156
#define VCMPLT vfcmp.clt.d
155157
#define VMUL vfmul.d
156158
#define VMSUB vfmsub.d
@@ -189,6 +191,7 @@ static inline int WhereAmI(void){
189191
#define XVFMAX xvfmax.s
190192
#define XVFMAXA xvfmaxa.s
191193
#define XVCMPEQ xvfcmp.ceq.s
194+
#define XVCMPLE xvfcmp.cle.s
192195
#define XVCMPLT xvfcmp.clt.s
193196
#define XVMUL xvfmul.s
194197
#define XVMSUB xvfmsub.s
@@ -201,6 +204,7 @@ static inline int WhereAmI(void){
201204
#define VFMAX vfmax.s
202205
#define VFMAXA vfmaxa.s
203206
#define VCMPEQ vfcmp.ceq.s
207+
#define VCMPLE vfcmp.cle.s
204208
#define VCMPLT vfcmp.clt.s
205209
#define VMUL vfmul.s
206210
#define VMSUB vfmsub.s

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lsx.S
99

1010
SAMAXKERNEL = amax_lsx.S
1111
DAMAXKERNEL = amax_lsx.S
12+
CAMAXKERNEL = camax_lsx.S
1213

1314
SAMINKERNEL = amin_lsx.S
1415
DAMINKERNEL = amin_lsx.S
@@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lsx.S
2526
ISMINKERNEL = imin_lsx.S
2627
IDMINKERNEL = imin_lsx.S
2728

28-
ISAMAXKERNEL = isamax_lsx.S
29-
IDAMAXKERNEL = idamax_lsx.S
29+
ISAMAXKERNEL = iamax_lsx.S
30+
IDAMAXKERNEL = iamax_lsx.S
31+
ICAMAXKERNEL = icamax_lsx.S
32+
IZAMAXKERNEL = icamax_lsx.S
3033

3134
ISAMINKERNEL = iamin_lsx.S
3235
IDAMINKERNEL = iamin_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lasx.S
99

1010
SAMAXKERNEL = amax_lasx.S
1111
DAMAXKERNEL = amax_lasx.S
12+
CAMAXKERNEL = camax_lasx.S
1213

1314
SAMINKERNEL = amin_lasx.S
1415
DAMINKERNEL = amin_lasx.S
@@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lasx.S
2526
ISMINKERNEL = imin_lasx.S
2627
IDMINKERNEL = imin_lasx.S
2728

28-
ISAMAXKERNEL = isamax_lasx.S
29-
IDAMAXKERNEL = idamax_lasx.S
29+
ISAMAXKERNEL = iamax_lasx.S
30+
IDAMAXKERNEL = iamax_lasx.S
31+
ICAMAXKERNEL = icamax_lasx.S
32+
IZAMAXKERNEL = icamax_lasx.S
3033

3134
ISAMINKERNEL = iamin_lasx.S
3235
IDAMINKERNEL = iamin_lasx.S

kernel/loongarch64/camax_lasx.S

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
#define ASSEMBLER

#include "common.h"

/*
 * Single-precision complex "amax" kernel using 256-bit LASX vectors.
 *
 * Returns, in $f0, the maximum of |re| + |im| over N interleaved
 * (re, im) float pairs starting at X and spaced INCX elements apart.
 * Returns 0 when N <= 0 or INCX <= 0.
 *
 * The code relies on scalar register $fK being the low element of vector
 * register $xrK: s1 ($f22) is lane 0 of VM0 ($xr22), so the vector and
 * scalar paths share one running maximum.  Several other scalar names
 * below alias vector names in the same way (s2/x1, s3/x2, s4/x3, t1/VT1,
 * t2/res0, a0/VX0, a1/VX1); each path only uses one side of an alias at
 * a time.
 */

/* Integer registers */
#define N      $r4      // element count
#define X      $r5      // pointer to the current element
#define INCX   $r6      // stride; rescaled to bytes in the prologue
#define I      $r12     // loop counter
#define TEMP   $r16     // byte size of one complex element

/* Scalar float registers */
#define t1     $f14
#define t2     $f18
#define t3     $f15
#define t4     $f17
#define s1     $f22     // primary running maximum / return value source
#define s2     $f9
#define s3     $f10
#define s4     $f11
#define a0     $f20
#define a1     $f21

/* Vector registers */
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VT0    $xr13
#define VT1    $xr14
#define res0   $xr18    // all-zero vector
#define neg1   $xr19    // -1.0f in every lane
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22    // running per-lane maximum
#define VM1    $xr23

    PROLOGUE
    xvxor.v VM0, VM0, VM0           // running maximum = 0 (also clears s1)
    xvxor.v res0, res0, res0        // zero vector for the sign compare
    bge $r0, N, .L999               // N <= 0: return 0
    bge $r0, INCX, .L999            // INCX <= 0: return 0
    li.d TEMP, 1
    li.w I, -1
    slli.d TEMP, TEMP, ZBASE_SHIFT  // TEMP = bytes per complex element
    slli.d INCX, INCX, ZBASE_SHIFT  // INCX = stride in bytes
    xvreplgr2vr.w neg1, I
    xvffint.s.w neg1, neg1          // integer -1 -> float -1.0 in all lanes
    srai.d I, N, 3                  // I = number of full groups of 8 elements
    bne INCX, TEMP, .L20            // non-unit stride: scalar loop
    bge $r0, I, .L23                // fewer than 8 elements: remainder only
    .align 3

.L10: // unit stride: 8 complex elements per iteration
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    addi.d I, I, -1
    xvpickev.w x1, VX1, VX0         // even words: real parts
    xvpickod.w x2, VX1, VX0         // odd words: imaginary parts
    xvfmul.s x3, neg1, x1           // x3 = -re
    xvfmul.s x4, neg1, x2           // x4 = -im
    xvfcmp.clt.s VT0, x1, res0      // mask of lanes with re < 0
    xvfcmp.clt.s VT1, x2, res0      // mask of lanes with im < 0
    xvbitsel.v x1, x1, x3, VT0      // x1 = |re|
    xvbitsel.v x2, x2, x4, VT1      // x2 = |im|
    addi.d X, X, 16 * SIZE
    xvfadd.s VM1, x1, x2            // |re| + |im|
    xvfmax.s VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11: // horizontal maximum of all 8 lanes of VM0 into lane 0 (= s1)
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    xvfmax.s VX0, x1, x2
    xvfmax.s VX1, x3, x4
    xvfmax.s VX0, VX0, VX1          // max of lanes 0..3
    // Lanes 4..7 also hold per-element results and must be reduced too.
    xvpickve.w x1, VM0, 4
    xvpickve.w x2, VM0, 5
    xvpickve.w x3, VM0, 6
    xvpickve.w x4, VM0, 7
    xvfmax.s VM0, x1, x2
    xvfmax.s VM1, x3, x4
    xvfmax.s VM0, VM0, VM1          // max of lanes 4..7
    xvfmax.s VM0, VM0, VX0          // lane 0 = max of all 8 lanes
    b .L23
    .align 3

.L20: // INCX != 1
    bge $r0, I, .L23
    // s1 is zero here (VM0 was cleared on entry); start the other three
    // accumulators from the same value so .L22 never reads stale registers.
    fmov.s s2, s1
    fmov.s s3, s1
    fmov.s s4, s1
    .align 3

.L21: // strided: 8 complex elements per iteration, two per accumulator
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s t1, t1, t3
    fmax.s s1, s1, t1               // fold into s1 across iterations
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s t1, t1, t3
    fmax.s s2, s2, t1               // second pair goes to s2, not s1
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    addi.d I, I, -1
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s t1, t1, t3
    fmax.s s3, s3, t1
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s t1, t1, t3
    fmax.s s4, s4, t1
    blt $r0, I, .L21
    .align 3

.L22: // combine the four strided accumulators into s1
    fmax.s s1, s1, s2
    fmax.s s3, s3, s4
    fmax.s s1, s1, s3
    .align 3

.L23: // remainder: the last N % 8 elements, one at a time
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a0, X, 0 * SIZE
    LD a1, X, 1 * SIZE
    addi.d I, I, -1
    FABS a0, a0
    FABS a1, a1
    ADD a0, a0, a1
    add.d X, X, INCX
    fmax.s s1, a0, s1
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, s1                  // return value in $f0
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)