Skip to content

Commit 3c53ded

Browse files
CNClareChen authored and yinshiyou committed
loongarch64: Add c/znrm2 optimization functions.
1 parent fbd612f commit 3c53ded

File tree

6 files changed

+818
-0
lines changed

6 files changed

+818
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ DROTKERNEL = rot_lsx.S
7070

7171
SNRM2KERNEL = snrm2_lsx.S
7272
DNRM2KERNEL = dnrm2_lsx.S
73+
CNRM2KERNEL = cnrm2_lsx.S
74+
ZNRM2KERNEL = znrm2_lsx.S
7375

7476
DGEMMKERNEL = dgemm_kernel_8x4.S
7577
DGEMMINCOPY = dgemm_ncopy_8_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ DROTKERNEL = rot_lasx.S
7070

7171
SNRM2KERNEL = snrm2_lasx.S
7272
DNRM2KERNEL = dnrm2_lasx.S
73+
CNRM2KERNEL = cnrm2_lasx.S
74+
ZNRM2KERNEL = znrm2_lasx.S
7375

7476
DGEMMKERNEL = dgemm_kernel_16x4.S
7577
DGEMMINCOPY = dgemm_ncopy_16.S

kernel/loongarch64/cnrm2_lasx.S

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
#include "common.h"

/*
 * cnrm2 kernel, LASX (256-bit vector) variant.
 *
 * Sums the squares of two consecutive floats per element (presumably the
 * real and imaginary parts) over N elements spaced INCX elements apart,
 * accumulating in double precision, and returns the square root of that sum
 * converted to single precision in $f0.  The result is 0 when N <= 0 or
 * INCX == 0.
 *
 * Four elements are consumed per vector iteration; the remaining N & 3
 * elements are handled by the scalar tail loop.
 *
 * SIZE, ZBASE_SHIFT, LDINT, PROLOGUE and EPILOGUE come from common.h, which
 * is not visible here; the comments below assume ZBASE_SHIFT is log2 of the
 * byte size of one element (two floats) -- TODO confirm against common.h.
 */

/* Incoming arguments. */
#define N      $r4      /* element count */
#define X      $r5      /* read cursor into the input vector */
#define INCX   $r6      /* stride in elements; rescaled to bytes below */

/* Integer scratch. */
#define I      $r17     /* loop counter */
#define TEMP   $r18     /* byte stride of a contiguous vector */
#define t1     $r12
#define t2     $r13
#define t3     $r14
#define t4     $r15

/* Scalar floating-point scratch, used by the tail loop only. */
#define a1     $f15     /* first float of the current element */
#define a2     $f16     /* second float of the current element */
#define res    $f19     /* scalar accumulator; same register number as res1 */

/* Vector registers. */
#define VX0    $xr15    /* eight packed input floats */
#define VX1    $xr16
#define VX2    $xr17
#define VX3    $xr18
#define res1   $xr19    /* double-precision partial sums */
#define res2   $xr20    /* second set of partial sums */

    PROLOGUE

#ifdef F_INTERFACE
    /* Presumably the by-reference (Fortran) calling form: N and INCX are
       replaced by the values they point at. */
    LDINT N, 0(N)
    LDINT INCX, 0(INCX)
#endif

    xvxor.v         res1, res1, res1        /* clear both accumulators */
    xvxor.v         res2, res2, res2
    bge             $r0, N, .L999           /* N <= 0: return sqrt(0) */
    beq             $r0, INCX, .L999        /* INCX == 0: return sqrt(0) */

    /* INCX is turned into a byte stride by shifting with ZBASE_SHIFT, so the
       reference value for "contiguous" must be built with the same shift.
       Comparing against SIZE instead made the test below always fail and
       left the contiguous loop at .L10 unreachable. */
    li.d            TEMP, 1
    slli.d          TEMP, TEMP, ZBASE_SHIFT
    slli.d          INCX, INCX, ZBASE_SHIFT
    srai.d          I, N, 2                 /* I = number of 4-element groups */
    bne             INCX, TEMP, .L20        /* non-unit stride: gather path */
    bge             $r0, I, .L997           /* fewer than 4 elements: tail only */
    .align 3

/* Contiguous input: one 256-bit load brings in four elements (8 floats). */
.L10:
    xvld            VX0, X, 0 * SIZE
    xvfcvtl.d.s     VX1, VX0                /* widen the floats to doubles, */
    xvfcvth.d.s     VX2, VX0                /* four per instruction         */
    xvfmadd.d       res1, VX1, VX1, res1    /* res1 += VX1 * VX1 */
    xvfmadd.d       res2, VX2, VX2, res2    /* res2 += VX2 * VX2 */
    addi.d          I, I, -1
    addi.d          X, X, 8 * SIZE
    blt             $r0, I, .L10
    b               .L996
    .align 3

/* Strided input: gather four elements through integer registers. */
.L20:
    bge             $r0, I, .L997           /* fewer than 4 elements: tail only */
    .align 3

.L21:
    ld.w            t1, X, 0 * SIZE
    ld.w            t2, X, 1 * SIZE
    add.d           X, X, INCX
    ld.w            t3, X, 0 * SIZE
    ld.w            t4, X, 1 * SIZE
    add.d           X, X, INCX
    xvinsgr2vr.w    VX0, t1, 0
    xvinsgr2vr.w    VX0, t2, 1
    xvinsgr2vr.w    VX0, t3, 2
    xvinsgr2vr.w    VX0, t4, 3
    ld.w            t1, X, 0 * SIZE
    ld.w            t2, X, 1 * SIZE
    add.d           X, X, INCX
    ld.w            t3, X, 0 * SIZE
    ld.w            t4, X, 1 * SIZE
    xvinsgr2vr.w    VX0, t1, 4
    xvinsgr2vr.w    VX0, t2, 5
    xvinsgr2vr.w    VX0, t3, 6
    xvinsgr2vr.w    VX0, t4, 7
    add.d           X, X, INCX
    xvfcvtl.d.s     VX1, VX0
    xvfcvth.d.s     VX2, VX0
    xvfmadd.d       res1, VX1, VX1, res1
    xvfmadd.d       res2, VX2, VX2, res2
    addi.d          I, I, -1
    blt             $r0, I, .L21
    /* falls through into the reduction */

/* Fold the partial sums so that lane 0 of res1 (read as res by the scalar
   code below) holds the total of all vector iterations. */
.L996:
    xvfadd.d        res1, res1, res2
    xvpickve.d      VX1, res1, 1
    xvpickve.d      VX2, res1, 2
    xvpickve.d      VX3, res1, 3
    xvfadd.d        res1, VX1, res1
    xvfadd.d        res1, VX2, res1
    xvfadd.d        res1, VX3, res1
    .align 3

/* Scalar tail: the N & 3 elements the vector loops did not consume. */
.L997:
    andi            I, N, 3
    bge             $r0, I, .L999
    .align 3

.L998:
    fld.s           a1, X, 0 * SIZE
    fld.s           a2, X, 1 * SIZE
    addi.d          I, I, -1
    fcvt.d.s        a1, a1
    fcvt.d.s        a2, a2
    fmadd.d         res, a1, a1, res        /* res += a1 * a1 */
    fmadd.d         res, a2, a2, res        /* res += a2 * a2 */
    add.d           X, X, INCX
    blt             $r0, I, .L998
    .align 3

.L999:
    fsqrt.d         res, res
    /* NOTE(review): copies the loop counter into $r4; the result is returned
       in $f0, so this looks like a leftover -- confirm before removing. */
    move            $r4, $r17
    fcvt.s.d        $f0, res                /* narrow the result to single */
    jirl            $r0, $r1, 0x0           /* return */

    EPILOGUE

kernel/loongarch64/cnrm2_lsx.S

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
#include "common.h"

/*
 * cnrm2 kernel, LSX (128-bit vector) variant.
 *
 * Accumulates, in double precision, the squares of two consecutive floats
 * per element over N elements that lie INCX elements apart, then returns
 * the square root of the total as a single-precision value in $f0.
 * N <= 0 or INCX == 0 yields 0.
 *
 * The vector loops take four elements per trip (two 128-bit chunks of two
 * elements each); whatever is left (N & 3) is finished one element at a
 * time.
 *
 * SIZE, ZBASE_SHIFT, LDINT, PROLOGUE and EPILOGUE are supplied by common.h,
 * which is not visible here.
 */

/* ---- arguments ------------------------------------------------------- */
#define N       $r4         /* number of elements */
#define X       $r5         /* current read position */
#define INCX    $r6         /* element stride, later scaled to bytes */

/* ---- integer temporaries --------------------------------------------- */
#define t1      $r12
#define t2      $r13
#define t3      $r14
#define t4      $r15
#define I       $r17        /* trip counter */
#define TEMP    $r18        /* byte stride that means "contiguous" */

/* ---- floating-point / vector temporaries ----------------------------- */
#define a1      $f15        /* tail loop: first float of an element */
#define a2      $f16        /* tail loop: second float of an element */
#define res     $f19        /* scalar accumulator, same number as res1 */
#define VX0     $vr15       /* four packed input floats */
#define VX1     $vr16
#define VX2     $vr17
#define VX3     $vr18
#define VX4     $vr21
#define res1    $vr19       /* partial sums */
#define res2    $vr20       /* partial sums */

    PROLOGUE

#ifdef F_INTERFACE
    /* Presumably the by-reference calling form: fetch the actual values. */
    LDINT N, 0(N)
    LDINT INCX, 0(INCX)
#endif

    vxor.v          res1, res1, res1
    vxor.v          res2, res2, res2
    bge             $r0, N, .Lfinish            /* nothing to sum */
    beq             $r0, INCX, .Lfinish         /* zero stride: give up */

    srai.d          I, N, 2                     /* groups of four elements */
    slli.d          INCX, INCX, ZBASE_SHIFT     /* stride in bytes */
    li.d            TEMP, 1
    slli.d          TEMP, TEMP, ZBASE_SHIFT     /* one element in bytes */
    bne             INCX, TEMP, .Lstrided
    bge             $r0, I, .Ltail
    .align 3

/* Unit stride: two vector loads cover four elements. */
.Lunit_loop:
    vld             VX0, X, 0 * SIZE
    vfcvtl.d.s      VX1, VX0
    vfcvth.d.s      VX2, VX0
    vfmadd.d        res1, VX1, VX1, res1
    vfmadd.d        res2, VX2, VX2, res2
    vld             VX0, X, 4 * SIZE
    vfcvtl.d.s      VX3, VX0
    vfcvth.d.s      VX4, VX0
    vfmadd.d        res1, VX3, VX3, res1
    vfmadd.d        res2, VX4, VX4, res2
    addi.d          X, X, 8 * SIZE
    addi.d          I, I, -1
    blt             $r0, I, .Lunit_loop
    b               .Lfold
    .align 3

/* Any other stride: build each vector from word loads. */
.Lstrided:
    bge             $r0, I, .Ltail
    .align 3

.Lstrided_loop:
    /* elements 0 and 1 of the group */
    ld.w            t1, X, 0 * SIZE
    ld.w            t2, X, 1 * SIZE
    add.d           X, X, INCX
    ld.w            t3, X, 0 * SIZE
    ld.w            t4, X, 1 * SIZE
    add.d           X, X, INCX
    vinsgr2vr.w     VX0, t1, 0
    vinsgr2vr.w     VX0, t2, 1
    vinsgr2vr.w     VX0, t3, 2
    vinsgr2vr.w     VX0, t4, 3
    vfcvtl.d.s      VX1, VX0
    vfcvth.d.s      VX2, VX0
    vfmadd.d        res1, VX1, VX1, res1
    vfmadd.d        res2, VX2, VX2, res2
    /* elements 2 and 3 of the group */
    ld.w            t1, X, 0 * SIZE
    ld.w            t2, X, 1 * SIZE
    add.d           X, X, INCX
    ld.w            t3, X, 0 * SIZE
    ld.w            t4, X, 1 * SIZE
    add.d           X, X, INCX
    vinsgr2vr.w     VX0, t1, 0
    vinsgr2vr.w     VX0, t2, 1
    vinsgr2vr.w     VX0, t3, 2
    vinsgr2vr.w     VX0, t4, 3
    vfcvtl.d.s      VX3, VX0
    vfcvth.d.s      VX4, VX0
    vfmadd.d        res1, VX3, VX3, res1
    vfmadd.d        res2, VX4, VX4, res2
    addi.d          I, I, -1
    blt             $r0, I, .Lstrided_loop
    b               .Lfold
    .align 3

/* Collapse the vector accumulators into lane 0 of res1, which the scalar
   code below reads as res. */
.Lfold:
    vfadd.d         res1, res1, res2
    vreplvei.d      VX1, res1, 1
    vfadd.d         res1, VX1, res1
    .align 3

/* Leftover elements, one per trip. */
.Ltail:
    andi            I, N, 3
    bge             $r0, I, .Lfinish
    .align 3

.Ltail_loop:
    fld.s           a1, X, 0 * SIZE
    fld.s           a2, X, 1 * SIZE
    addi.d          I, I, -1
    fcvt.d.s        a1, a1
    fcvt.d.s        a2, a2
    fmadd.d         res, a1, a1, res
    fmadd.d         res, a2, a2, res
    add.d           X, X, INCX
    blt             $r0, I, .Ltail_loop
    .align 3

.Lfinish:
    fsqrt.d         res, res
    /* NOTE(review): writes the trip counter to $r4 although the result is
       handed back in $f0 -- purpose unclear from this file. */
    move            $r4, $r17
    fcvt.s.d        $f0, res
    jirl            $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)