Skip to content

Commit e771be1

Browse files
XiWeiGu authored and yinshiyou committed
Optimize copy functions with lsx.
Signed-off-by: Hao Chen <[email protected]>
1 parent 179ed51 commit e771be1

File tree

5 files changed

+1349
-4
lines changed

5 files changed

+1349
-4
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S
5959
DNRM2KERNEL = dnrm2_lsx.S
6060

6161
DGEMMKERNEL = dgemm_kernel_8x4.S
62-
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
63-
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
64-
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
65-
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
62+
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
63+
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
64+
DGEMMONCOPY = dgemm_ncopy_4_lsx.S
65+
DGEMMOTCOPY = dgemm_tcopy_4_lsx.S
6666
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
6767
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
6868
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
/*******************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
#define ASSEMBLER
28+
29+
#include "common.h"
30+
#include "loongarch64_asm.S"
31+
32+
/* Function parameters */
33+
#define M $r4 // param 1: m, element count copied from each column; decremented in place by the final single-column loop
34+
#define N $r5 // param 2: n, number of columns
35+
#define SRC $r6 // param 3: src, source pointer; only read once (copied into TS) because T0 reuses $r6
36+
#define LDA $r7 // param 4: lda, column stride in elements; replaced by its byte stride because TL reuses $r7
37+
#define DST $r8 // param 5: dst, destination pointer (copied into TD)
38+
39+
#define I $r9 // inner (row) loop counter
40+
#define J $r10 // outer (column group) loop counter; also holds the N & 2 test result
41+
#define S1 $r12 // read cursor, 1st column of the current group
42+
#define S2 $r13 // read cursor, 2nd column of the current group
43+
#define S3 $r14 // read cursor, 3rd column of the current group
44+
#define S4 $r15 // read cursor, 4th column of the current group
45+
#define S5 $r16 // S5-S8: defined but not referenced by the routine in this file
46+
#define S6 $r17
47+
#define S7 $r18
48+
#define S8 $r19
49+
#define TD $r20 // write cursor into the destination buffer
50+
#define TS $r21 // address of the first column not yet consumed
51+
#define TL $r7 // same register as LDA: holds LDA << 3 (column stride in bytes)
52+
#define T0 $r6 // same register as SRC: holds TL << 1 (two column strides in bytes)
53+
#define ZERO $r0 // used as the constant-zero operand in compares and branches
54+
55+
#define F0 $f0 // F0-F3: scalar temporaries for the leftover-row and single-column loops
56+
#define F1 $f1
57+
#define F2 $f2
58+
#define F3 $f3
59+
#define F4 $f4 // F4-F7: defined but not referenced by the routine in this file
60+
#define F5 $f5
61+
#define F6 $f6
62+
#define F7 $f7
63+
/* LSX vectors */
64+
#define U0 $vr0 // U0-U3: vectors loaded from the source columns
65+
#define U1 $vr1
66+
#define U2 $vr2
67+
#define U3 $vr3
68+
#define U4 $vr4 // U4-U7: defined but not referenced by the routine in this file
69+
#define U5 $vr5
70+
#define U6 $vr6
71+
#define U7 $vr7
72+
#define D0 $vr8 // D0-D3: interlaced vectors stored to the destination
73+
#define D1 $vr9
74+
#define D2 $vr10
75+
#define D3 $vr11
76+
#define D4 $vr12 // D4-D7: defined but not referenced by the routine in this file
77+
#define D5 $vr13
78+
#define D6 $vr14
79+
#define D7 $vr15
80+
81+
/*
 * Double-precision GEMM "ncopy" packing routine, 4 columns per panel,
 * using LSX vector loads/stores for the bulk of the work.
 * NOTE(review): presumably this is dgemm_ncopy_4_lsx.S (the 4-wide
 * DGEMMONCOPY kernel) - confirm against the build files.
 *
 * Inputs (register names come from the #defines earlier in this file):
 *   M   - number of elements copied from every column
 *   N   - number of columns
 *   SRC - source pointer; column j starts LDA * 8 * j bytes after SRC
 *   LDA - column stride in elements (8-byte doubles)
 *   DST - destination pointer, written strictly sequentially
 *
 * Output order:
 *   1. N / 4 groups of four columns; for every row i the group emits
 *      col0[i], col1[i], col2[i], col3[i].
 *   2. If N & 2: one group of two columns, emitted row by row.
 *   3. If N & 1: the last column, copied unchanged.
 *
 * Register notes:
 *   - TL shares $r7 with LDA and T0 shares $r6 with SRC, so LDA and SRC
 *     are dead once TL and T0 have been computed.
 *   - M is decremented in place by the single-column loop.
 *
 * NOTE(review): GLD / GINTERLACE / GST are macros from loongarch64_asm.S.
 * The layout above assumes "GINTERLACE v, d, lo, hi, b, a" produces
 * lo = {a[0], b[0]} and hi = {a[1], b[1]}, which matches what the scalar
 * leftover loops store - confirm against the macro definition.
 */
PROLOGUE

    move       TD,    DST
    move       TS,    SRC            /* must happen before T0 (same register as SRC) is written */
    slli.d     TL,    LDA,   0x03    /* TL = column stride in bytes */
    slli.d     T0,    TL,    0x01    /* T0 = two column strides */
    srai.d     J,     N,     0x02    /* J = number of 4-column groups */
    beq        J,     ZERO,  .L_N2

.L_J1: /* J-- : one group of four columns */
    move       S1,    TS
    add.d      S2,    TS,    TL
    srai.d     I,     M,     0x02    /* I = number of 4-row steps */
    add.d      S3,    S2,    TL
    add.d      S4,    S2,    T0
    add.d      TS,    S3,    T0      /* TS = first column of the next group */
    addi.d     J,     J,     -1
    beq        I,     ZERO,  .L_I3

.L_I1: /* I-- : four rows of four columns, 0x80 bytes of output */
    /* rows 0-1 of the step */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d     TD,    TD,    0x40

    /* rows 2-3 of the step */
    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30

    addi.d     S1,    S1,    0x20
    addi.d     S2,    S2,    0x20
    addi.d     S3,    S3,    0x20
    addi.d     S4,    S4,    0x20
    addi.d     TD,    TD,    0x40

    addi.d     I,     I,     -1
    blt        ZERO,  I,     .L_I1

.L_I3: /* leftover rows (M & 3) of the 4-column group, one row per pass */
    andi       I,     M,     0x03
    beq        I,     ZERO,  .L_I0

.L_II1:
    fld.d      F0,    S1,    0x00
    fld.d      F1,    S2,    0x00
    fld.d      F2,    S3,    0x00
    fld.d      F3,    S4,    0x00

    fst.d      F0,    TD,    0x00
    addi.d     S1,    S1,    0x08
    fst.d      F1,    TD,    0x08
    addi.d     S2,    S2,    0x08
    fst.d      F2,    TD,    0x10
    addi.d     S3,    S3,    0x08
    fst.d      F3,    TD,    0x18
    addi.d     S4,    S4,    0x08

    addi.d     TD,    TD,    0x20
    addi.d     I,     I,     -1
    blt        ZERO,  I,     .L_II1

.L_I0:
    blt        ZERO,  J,     .L_J1

.L_N2: /* optional group of two columns */
    andi       J,     N,     0x02
    beq        ZERO,  J,     .L_N1

    move       S1,    TS
    add.d      S2,    TS,    TL
    srai.d     I,     M,     0x01    /* I = number of 2-row steps */
    add.d      TS,    S2,    TL      /* TS = column after this pair */
    beq        I,     ZERO,  .L_2I3

.L_2I1: /* I-- : two rows of two columns */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00
    GINTERLACE v, d, D0, D1, U1, U0
    GST v, , D0, TD, 0x00, D1, TD, 0x10
    addi.d     S1,    S1,    0x10
    addi.d     S2,    S2,    0x10
    addi.d     TD,    TD,    0x20

    addi.d     I,     I,     -1
    blt        ZERO,  I,     .L_2I1

.L_2I3: /* leftover row (M & 1) of the 2-column group */
    andi       I,     M,     0x01
    beq        ZERO,  I,     .L_N1

.L_2II1: /* I-- */
    fld.d      F0,    S1,    0x00
    fld.d      F1,    S2,    0x00
    fst.d      F0,    TD,    0x00
    addi.d     I,     I,     -1
    fst.d      F1,    TD,    0x08
    addi.d     S1,    S1,    0x08
    addi.d     S2,    S2,    0x08
    addi.d     TD,    TD,    0x10
    blt        ZERO,  I,     .L_2II1

.L_N1: /* optional single trailing column */
    /*
     * Only an odd N leaves one column to copy. Without this test the loop
     * below would also run for even N, reading M doubles from the column
     * after the last one and writing M extra doubles to the destination.
     */
    andi       J,     N,     0x01
    beq        ZERO,  J,     .L_N0

    move       S1,    TS
    beq        ZERO,  M,     .L_N0

.L_M1: /* M-- : plain element-by-element copy */
    fld.d      F0,    S1,    0x00
    addi.d     S1,    S1,    0x08
    fst.d      F0,    TD,    0x00
    addi.d     TD,    TD,    0x08
    addi.d     M,     M,     -1
    blt        ZERO,  M,     .L_M1

.L_N0:
    jirl       $r0,   $r1,   0x00    /* return to the address in $r1 */
EPILOGUE

0 commit comments

Comments
 (0)