Commit 0753848

CNClareChen authored and yinshiyou committed
loongarch64: Refine and add axpy optimization functions.
Signed-off-by: Hao Chen <[email protected]>
1 parent 06fd5b5 commit 0753848

8 files changed: 1839 additions, 711 deletions

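For reference, the axpy kernels touched by this commit implement the BLAS AXPY operation, y := alpha*x + y over strided vectors. A minimal scalar C sketch of that textbook definition (double precision shown; axpy_ref is a hypothetical name, not the OpenBLAS kernel interface, and negative-increment bookkeeping is ignored):

    /* Textbook real AXPY: y[i*incy] += alpha * x[i*incx].
       The early exits mirror the "bge $r0, N, .L999" and alpha == 0
       checks at the top of the assembly kernel below. */
    static void axpy_ref(int n, double alpha,
                         const double *x, int incx, double *y, int incy)
    {
        if (n <= 0 || alpha == 0.0)
            return;
        for (int i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }

The single-precision version computes the same thing with float, which is why one macro-parameterized source can serve both precisions.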

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 4 additions & 2 deletions
@@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lsx.S
 SSWAPKERNEL = swap_lsx.S
 DSWAPKERNEL = swap_lsx.S
 
-SAXPYKERNEL = saxpy_lsx.S
-DAXPYKERNEL = daxpy_lsx.S
+SAXPYKERNEL = axpy_lsx.S
+DAXPYKERNEL = axpy_lsx.S
+CAXPYKERNEL = caxpy_lsx.S
+ZAXPYKERNEL = caxpy_lsx.S
 
 SAXPBYKERNEL = saxpby_lsx.S
 DAXPBYKERNEL = daxpby_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 4 additions & 2 deletions
@@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lasx.S
 SSWAPKERNEL = swap_lasx.S
 DSWAPKERNEL = swap_lasx.S
 
-SAXPYKERNEL = saxpy_lasx.S
-DAXPYKERNEL = daxpy_lasx.S
+SAXPYKERNEL = axpy_lasx.S
+DAXPYKERNEL = axpy_lasx.S
+CAXPYKERNEL = caxpy_lasx.S
+ZAXPYKERNEL = caxpy_lasx.S
 
 SAXPBYKERNEL = saxpby_lasx.S
 DAXPBYKERNEL = daxpby_lasx.S
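The new CAXPYKERNEL and ZAXPYKERNEL entries in both files route the complex variants to caxpy_lsx.S and caxpy_lasx.S, whose diffs are not shown in this excerpt. For orientation only, complex AXPY expands to four real multiplies per element; a hedged C sketch of that standard definition (caxpy_ref is a hypothetical name, interleaved re/im storage is assumed, and the conjugate variant is omitted):

    /* Complex AXPY, y += alpha*x, with (re, im) pairs stored interleaved.
       Scalar reference only; the actual kernels vectorize this with LSX/LASX. */
    static void caxpy_ref(int n, float alpha_r, float alpha_i,
                          const float *x, int incx, float *y, int incy)
    {
        for (int i = 0; i < n; i++) {
            const float xr = x[2 * i * incx], xi = x[2 * i * incx + 1];
            y[2 * i * incy]     += alpha_r * xr - alpha_i * xi;  /* real part */
            y[2 * i * incy + 1] += alpha_r * xi + alpha_i * xr;  /* imag part */
        }
    }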

kernel/loongarch64/daxpy_lasx.S renamed to kernel/loongarch64/axpy_lasx.S

Lines changed: 214 additions & 23 deletions
@@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 
+#define ASSEMBLER
 #include "common.h"
+
 #define N $r4
 #define XX $r5
 #define YY $r6
@@ -35,16 +62,20 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT a1, a1
     movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT a2, a2
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L999
     slli.d TEMP, TEMP, BASE_SHIFT
     slli.d INCX, INCX, BASE_SHIFT
     slli.d INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
     xvreplgr2vr.d VXA, t1
+#else
+    xvreplgr2vr.w VXA, t1
+#endif
 
     srai.d I, N, 3
     bne INCX, TEMP, .L20
@@ -56,11 +87,12 @@
 
 .L11:
     bge $r0, I, .L113
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2
     bceqz $fcc0, .L112
     .align 3
 
 .L111:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
@@ -70,13 +102,21 @@
     addi.d I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L111
     b .L113
     .align 3
 
 .L112:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
@@ -86,6 +126,13 @@
     addi.d I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L112
@@ -97,11 +144,11 @@
     .align 3
 
 .L114:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     addi.d X, X, SIZE
     addi.d Y, Y, SIZE
     blt $r0, I, .L114
@@ -114,6 +161,7 @@
     .align 3
 
 .L121:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
@@ -158,6 +206,50 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L121
@@ -169,11 +261,11 @@
     .align 3
 
 .L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     addi.d X, X, SIZE
     add.d Y, Y, INCY
     blt $r0, I, .L123
@@ -185,6 +277,7 @@
     .align 3
 
 .L211:
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -217,6 +310,37 @@
     addi.d I, I, -1
     xvst VX3, Y, 4 * SIZE
     addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
     blt $r0, I, .L211
     .align 3
 
@@ -226,11 +350,11 @@
     .align 3
 
 .L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     add.d X, X, INCX
     addi.d Y, Y, SIZE
     blt $r0, I, .L213
@@ -243,6 +367,7 @@
     .align 3
 
 .L222:
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -309,6 +434,73 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     .align 3
@@ -319,15 +511,14 @@
     .align 3
 
 .L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD $f12, X, 0 * SIZE
+    LD $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST $f14, Y, 0 * SIZE
     add.d X, X, INCX
     add.d Y, Y, INCY
     blt $r0, I, .L224
-    b .L999
     .align 3
 
 .L999:
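What makes the daxpy_lasx.S → axpy_lasx.S rename work is the precision-generic rewrite visible above: type-specific instructions (fld.d, fst.d, fmadd.d, ffint.d.l, fcmp.ceq.d, movfr2gr.d) become the LD, ST, MADD, FFINT, CMPEQ and MTG macros, which common.h is assumed to expand to the .d or .s forms depending on whether DOUBLE is defined, while the places where the two precisions genuinely diverge (vector element width, lane inserts and stores) sit behind #ifdef DOUBLE / #else. A rough C analogue of the same build-one-source-twice technique, with hypothetical names:

    /* Hypothetical illustration only: one source compiled twice, once with
       -DDOUBLE and once without, like the unified axpy kernel sources. */
    #include <stddef.h>

    #ifdef DOUBLE
    typedef double FLOAT;
    #define VECLEN 4           /* a 256-bit LASX register holds 4 doubles */
    #else
    typedef float FLOAT;
    #define VECLEN 8           /* ...or 8 floats, hence the extra #else paths */
    #endif

    void axpy_generic(size_t n, FLOAT alpha, const FLOAT *x, FLOAT *y)
    {
        size_t i = 0;
        /* Blocked main loop stands in for the unrolled .L111/.L112 bodies. */
        for (; i + VECLEN <= n; i += VECLEN)
            for (size_t j = 0; j < VECLEN; j++)
                y[i + j] += alpha * x[i + j];
        /* Scalar tail, analogous to the .L114 remainder loop. */
        for (; i < n; i++)
            y[i] += alpha * x[i];
    }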
