Skip to content

Commit 577d480

Browse files
authored
Merge pull request #4529 from ErnstPeng/feature-branch
Optimized sgemv and dgemv kernel LSX for LoongArch
2 parents cfbb701 + b2db064 commit 577d480

File tree

5 files changed

+1016
-0
lines changed

5 files changed

+1016
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
8585
CSUMKERNEL = csum_lsx.S
8686
ZSUMKERNEL = csum_lsx.S
8787

88+
SGEMVNKERNEL = sgemv_n_lsx.S
89+
SGEMVTKERNEL = sgemv_t_lsx.S
90+
91+
DGEMVNKERNEL = dgemv_n_lsx.S
92+
DGEMVTKERNEL = dgemv_t_lsx.S
93+
8894
DGEMMKERNEL = dgemm_kernel_8x4.S
8995
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
9096
DGEMMITCOPY = dgemm_tcopy_8_lsx.S

kernel/loongarch64/dgemv_n_lsx.S

Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
/*******************************************************************************
2+
Copyright (c) 2024, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
28+
#define ASSEMBLER
29+
30+
#include "common.h"
31+
32+
/* Param: registers the kernel reads its operands from.
 *   M     - inner-loop bound: elements of y / rows of A updated per column
 *   N     - outer-loop bound: number of columns of A / elements of x consumed
 *   A     - base address of the matrix; LDA is the step from one column to
 *           the next (in elements on entry, scaled to bytes in the prologue)
 *   X, Y  - base addresses of the two vectors; INCX / INCY are their strides
 *           (in elements on entry, scaled to bytes in the prologue)
 *   ALPHA - scalar multiplier
 * INCY and BUFFER are not live in registers on entry: the prologue loads them
 * from 0($sp) and 8($sp). BUFFER is loaded but not referenced afterwards.
 * NOTE(review): the register numbers presumably follow the LoongArch calling
 * convention for the gemv kernel signature - confirm against common.h.
 */
33+
#define M $r4
34+
#define N $r5
35+
#define A $r7
36+
#define LDA $r8
37+
#define X $r9
38+
#define INCX $r10
39+
#define Y $r11
40+
#define INCY $r6
41+
#define BUFFER $r16
42+
#define ALPHA $f0
43+
/* Integer working registers.
 *   T0..T4  - temporaries (loop trip count, y byte offsets)
 *   XX, YY  - working copies of X and Y
 *   I, J    - inner / outer loop counters
 *   AO1     - address of the current column of A
 *   IX, IY  - running byte offsets into x and y
 *   II      - running byte offset inside the current column of A
 * $r23-$r30 (AO1 .. T4) are saved and restored by the kernel itself.
 * YORIG and AO2 are defined but not referenced by the kernel.
 */
44+
#define YORIG $r18
45+
#define T0 $r19
46+
#define T1 $r20
47+
#define XX $r12
48+
#define YY $r13
49+
#define I $r14
50+
#define J $r15
51+
#define AO1 $r23
52+
#define AO2 $r24
53+
#define IX $r25
54+
#define IY $r26
55+
#define II $r27
56+
#define T2 $r28
57+
#define T3 $r29
58+
#define T4 $r30
59+
60+
/* LSX vectors (128-bit, two doubles each).
 *   U0      - x[j] replicated into both lanes
 *   U1      - alpha * x[j] in both lanes
 *   U2, U7  - two consecutive elements each of the current column of A
 *   U3, U5  - pairs of y elements being updated
 *   U4, U6  - carry the second element of each pair in and out of lane 0
 *   VALPHA  - alpha replicated into both lanes
 * U8 and U9 are defined but not referenced by the kernel.
 */
61+
#define U0 $vr11
62+
#define U1 $vr12
63+
#define U2 $vr2
64+
#define U3 $vr3
65+
#define U4 $vr4
66+
#define U5 $vr5
67+
#define U6 $vr6
68+
#define U7 $vr7
69+
#define U8 $vr8
70+
#define U9 $vr9
71+
#define VALPHA $vr10
72+
/* Scalar double-precision names.
 * The kernel loads and stores y through a1..a4 and does the arithmetic on
 * U3..U6, i.e. it relies on $f3..$f6 being the low 64 bits of $vr3..$vr6.
 * a5..a8 are defined but not referenced by the kernel.
 */
73+
#define a1 $f3
74+
#define a2 $f4
75+
#define a3 $f5
76+
#define a4 $f6
77+
#define a5 $f7
78+
#define a6 $f8
79+
#define a7 $f9
80+
#define a8 $f10
81+
82+
83+
/*
 * Double-precision GEMV, non-transposed, LSX version.
 *
 * For every column j in [0, N):
 *     t = ALPHA * x[j]
 *     y[i] += t * A[i + j * LDA]      for i in [0, M)
 *
 * x is read with stride INCX and y is read/written with stride INCY; the
 * elements of one column of A are read as contiguous doubles.
 * Returns immediately (y untouched) when M <= 0 or N <= 0.
 *
 * Stack: an 80-byte frame holds $r23-$r30 (offsets 0,8,16,32..64; the slot
 * at 24 is unused) and a copy of ALPHA at offset 72.
 *
 * Register overlap: the code treats $fN as lane 0 of $vrN. y elements are
 * loaded/stored through a1..a4 while the arithmetic is done on U3..U6, and
 * the scalar tail reads alpha * x[j] from $f12 (lane 0 of U1).
 */
PROLOGUE
84+
// INCY and BUFFER arrive on the caller's stack; read them before $sp moves.
85+
LDARG INCY, $sp, 0
86+
LDARG BUFFER, $sp, 8
87+
88+
addi.d $sp, $sp, -80
89+
// Save the registers this kernel uses as AO1, AO2, IX, IY, II, T2, T3, T4.
90+
SDARG $r23, $sp, 0
91+
SDARG $r24, $sp, 8
92+
SDARG $r25, $sp, 16
93+
SDARG $r26, $sp, 32
94+
SDARG $r27, $sp, 40
95+
SDARG $r28, $sp, 48
96+
SDARG $r29, $sp, 56
97+
SDARG $r30, $sp, 64
98+
ST ALPHA, $sp, 72
99+
// Broadcast alpha into both lanes by reloading the copy just stored.
100+
vldrepl.d VALPHA, $sp, 72
101+
// Convert the element strides into byte strides.
102+
slli.d LDA, LDA, BASE_SHIFT
103+
slli.d INCX, INCX, BASE_SHIFT
104+
slli.d INCY, INCY, BASE_SHIFT
105+
// Nothing to do for an empty matrix.
106+
bge $r0, M, .L999
107+
bge $r0, N, .L999
108+
109+
move J, $r0
110+
move IX, $r0
111+
112+
move AO1, A //a_ptr
113+
move XX, X
114+
move YY, Y
115+
// J is 0 and M > 0 here, so this branch is never taken.
116+
beq J, M, .L999
117+
// ---- Outer loop: one column of A (one element of x) per iteration ----
118+
.L01:
// Load x[j] with a 64-bit scalar load into lane 0 of U0 ($f11 overlaps
// $vr11). A 128-bit vector load here would also touch the 8 bytes after
// x[j], which lie outside the vector for its last element.
119+
fldx.d $f11, XX, IX
120+
vshuf4i.d U0, U0, 0x00
121+
122+
vfmul.d U1, VALPHA, U0 //temp1
123+
124+
move IY, $r0
125+
move II, $r0
126+
move I, $r0
127+
128+
srai.d T0, M, 2 // T0 = M / 4: number of 4-row groups
129+
beq I, T0, .L03
130+
// ---- Inner loop: update four elements of y per iteration ----
131+
.L02:
132+
vldx U2, AO1, II
133+
addi.d II, II, 16
134+
vldx U7, AO1, II
135+
// Byte offsets of the four y elements (y may be strided).
136+
move T1, IY
137+
add.d T2, T1, INCY
138+
add.d T3, T2, INCY
139+
add.d T4, T3, INCY
140+
141+
fldx.d a1, YY, T1
142+
fldx.d a2, YY, T2
143+
fldx.d a3, YY, T3
144+
fldx.d a4, YY, T4
145+
// Pack the scalars into vectors: U3 = {a1, a2}, U5 = {a3, a4}.
146+
vextrins.d U3, U4, 0x10
147+
vextrins.d U5, U6, 0x10
148+
149+
vfmadd.d U3, U1, U2, U3
150+
vfmadd.d U5, U1, U7, U5
151+
// Move lane 1 of each result back to lane 0 of U4 / U6 so that a2 / a4
// hold the second element of each pair for the scalar stores.
152+
vextrins.d U4, U3, 0x01
153+
vextrins.d U6, U5, 0x01
154+
155+
fstx.d a1, YY, T1
156+
fstx.d a2, YY, T2
157+
fstx.d a3, YY, T3
158+
fstx.d a4, YY, T4
159+
160+
add.d IY, T4, INCY
161+
addi.d II, II, 16
162+
addi.d I, I, 1
163+
blt I, T0, .L02
164+
// ---- Tail: two rows, taken when bit 1 of M is set ----
165+
.L03:
166+
andi T0, M, 2
167+
beq $r0, T0, .L04
168+
// II = byte offset of row (M - M % 4), the first row not handled above.
169+
addi.d T1, $r0, 4
170+
mod.d T1, M, T1
171+
sub.d II, M, T1
172+
slli.d II, II, BASE_SHIFT
173+
174+
move T1, IY
175+
add.d T2, T1, INCY
176+
177+
vldx U2, AO1, II
178+
179+
fldx.d a1, YY, T1
180+
fldx.d a2, YY, T2
181+
182+
vextrins.d U3, U4, 0x10
183+
184+
vfmadd.d U3, U1, U2, U3
185+
186+
vextrins.d U4, U3, 0x01
187+
188+
fstx.d a1, YY, T1
189+
fstx.d a2, YY, T2
190+
191+
add.d IY, T2, INCY
192+
// ---- Tail: last row, taken when M is odd ----
193+
.L04:
194+
andi T0, M, 1
195+
beq $r0, T0, .L05
196+
197+
addi.d II, M, -1
198+
slli.d II, II, BASE_SHIFT
199+
200+
fldx.d a1, AO1, II
201+
fldx.d a3, YY, IY
202+
// $f12 is lane 0 of U1, i.e. alpha * x[j].
203+
fmadd.d a3, $f12, a1, a3
204+
205+
fstx.d a3, YY, IY
206+
207+
add.d IY, IY, INCY
208+
// ---- Advance to the next column of A and the next element of x ----
209+
.L05:
210+
add.d AO1, AO1, LDA
211+
add.d IX, IX, INCX
212+
213+
addi.d J, J, 1
214+
blt J, N, .L01
215+
// ---- Common exit: restore saved registers, release the frame, return ----
216+
.L999:
217+
LDARG $r23, $sp, 0
218+
LDARG $r24, $sp, 8
219+
LDARG $r25, $sp, 16
220+
LDARG $r26, $sp, 32
221+
LDARG $r27, $sp, 40
222+
LDARG $r28, $sp, 48
223+
LDARG $r29, $sp, 56
224+
LDARG $r30, $sp, 64
225+
LD ALPHA, $sp, 72
226+
addi.d $sp, $sp, 80
227+
jirl $r0, $r1, 0x0
228+
229+
EPILOGUE

0 commit comments

Comments
 (0)