
Commit cfbb701

Merge pull request #4536 from XiWeiGu/loongarch64-cgemv-zgemv-opt
Loongarch64 cgemv zgemv opt
2 parents 5fbe259 + 8e05c05

File tree: 8 files changed (+1212, -26 lines)


kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 6 additions & 0 deletions
@@ -100,6 +100,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
+CGEMVNKERNEL = cgemv_n_4_lsx.S
+CGEMVTKERNEL = cgemv_t_4_lsx.S
+
 CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
 CGEMMINCOPY = cgemm_ncopy_8_lsx.S
 CGEMMITCOPY = cgemm_tcopy_8_lsx.S
@@ -115,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
+ZGEMVNKERNEL = zgemv_n_2_lsx.S
+ZGEMVTKERNEL = zgemv_t_2_lsx.S
+
 ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
 ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
 ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
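
For context, these entries select the kernels used for the single- and double-precision complex GEMV routines. The non-transposed case that CGEMVNKERNEL covers computes, in effect, y += alpha * A * x over a column-major matrix. Below is a minimal C sketch of that operation, with unit strides and a hypothetical helper name (cgemv_n_ref); it is illustrative only, not the OpenBLAS implementation:

    #include <complex.h>
    
    /* Illustrative reference for the kernel selected by CGEMVNKERNEL:
     * y += alpha * A * x, with A a column-major m x n single-precision
     * complex matrix. Hypothetical name; unit strides assumed for clarity. */
    static void cgemv_n_ref(int m, int n, float complex alpha,
                            const float complex *a, int lda,
                            const float complex *x, float complex *y)
    {
        for (int j = 0; j < n; j++) {
            /* Scale x[j] by alpha once per column, as the assembly's
             * CLOAD_X_* macros do up front. */
            float complex t = alpha * x[j];
            for (int i = 0; i < m; i++)
                y[i] += t * a[i + (size_t)j * lda];
        }
    }

The LSX kernel added below vectorizes this loop nest, processing four columns of A against four elements of y per iteration, with scalar tail loops for the leftover rows and columns.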

kernel/loongarch64/cgemv_n_4_lsx.S

Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

#define J       $r12
#define I       $r13
#define K       $r14
#define Y_ORG   $r15
#define OFFSET  $r16
#define K_LDA   $r17
#define M8      $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29

#define VALPHA  $vr1
#define X0      $vr2
#define X1      $vr3
#define X2      $vr4
#define X3      $vr5
#define X4      $vr6
#define X5      $vr7
#define X6      $vr8
#define X7      $vr9
#define Y0      $vr10
#define Y1      $vr11
#define A0      $vr12
#define A1      $vr13
#define A2      $vr14
#define A3      $vr15
#define A4      $vr16
#define A5      $vr17
#define A6      $vr18
#define A7      $vr19
#define A8      $vr20
#define A9      $vr21
#define A10     $vr22
#define A11     $vr23
#define A12     $vr24
#define A13     $vr25
#define A14     $vr26
#define A15     $vr27
#define TMP0    $vr28
#define TMP1    $vr29
#define TMP2    $vr30

#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  0
#else
#define GXCONJ 1
#define GCONJ  0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  1
#else
#define GXCONJ 1
#define GCONJ  1
#endif
#endif

.macro CLOAD_X_4
    GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
                X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm

.macro CLOAD_X_4_GAP
    vldrepl.d X0, X, 0x00
    PTR_ADD   T0, X, INC_X
    vldrepl.d X1, T0, 0x00
    PTR_ADD   T0, T0, INC_X
    vldrepl.d X2, T0, 0x00
    PTR_ADD   T0, T0, INC_X
    vldrepl.d X3, T0, 0x00

    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
                X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm

.macro CLOAD_X_1
    GLDREPL v, d, X0, X, 0x00
    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm

.macro CLOAD_Y_4
    GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm

.macro CLOAD_Y_4_GAP
    fld.d     $f10, Y, 0
    fldx.d    $f13, Y, INC_Y
    PTR_ALSL  T0, INC_Y, Y, 1
    fld.d     $f11, T0, 0
    fldx.d    $f17, T0, INC_Y
    vpackev.d Y0, A1, Y0
    vpackev.d Y1, A5, Y1
.endm

.macro CLOAD_Y_1
    fld.d $f10, Y, 0
.endm

.macro CSTORE_Y_4
    GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm

.macro CSTORE_Y_4_GAP
    vstelm.d Y0, Y, 0, 0
    PTR_ADD  T0, Y, INC_Y
    vstelm.d Y0, T0, 0, 1
    PTR_ADD  T0, T0, INC_Y
    vstelm.d Y1, T0, 0, 0
    PTR_ADD  T0, T0, INC_Y
    vstelm.d Y1, T0, 0, 1
.endm

.macro CSTORE_Y_1
    fst.d $f10, Y, 0
.endm

.macro CGEMV_N_4x4
    GLD_INC v, , 0x10, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0

    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_1x4
    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_1x1
    fld.d    $f12, PA0, 0
    PTR_ADDI PA0, PA0, 0x08
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
    PTR_SRLI J, N, 2
    beqz     J, .L_\XW\()_N_3
    PTR_SLLI K_LDA, LDA, 2
    PTR_SUB  K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
    CLOAD_\X_4
    xor      K, K, K
    move     Y, Y_ORG
    PTR_SRLI I, M, 2
    beqz     I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    CLOAD_\Y_4
    CGEMV_N_4x4
    CSTORE_\Y_4
    PTR_ADDI I, I, -1
    PTR_ALSL Y, INC_Y, Y, 2
    PTR_ADDI K, K, 4
    bnez     I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    andi     I, M, 3
    beqz     I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x4
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    PTR_ALSL X, INC_X, X, 2
    bnez     J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi     J, N, 3
    beqz     J, .L_END
.L_\XW\()_N_L1:
    CLOAD_\X_1
    xor      K, K, K
    move     Y, Y_ORG
    move     I, M
    beqz     I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x1
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    PTR_SUB  K_LDA, LDA, M8
    PTR_ADD  PA0, PA0, K_LDA
    PTR_ADD  X, X, INC_X
    bnez     J, .L_\XW\()_N_L1

    b .L_END
.endm

    PROLOGUE
    PTR_LD   INC_Y, $sp, 0
    push_if_used 17 + 7, 31
    PTR_ADDI K, $r0, 0x01
    PTR_SUB  I, INC_X, K
    PTR_SUB  J, INC_Y, K
    maskeqz  I, K, I    /* if (inc_x == 1) I = 0; else I = 1; */
    maskeqz  J, K, J    /* if (inc_y == 1) J = 0; else J = 1; */
    PTR_ALSL I, I, J, 1
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA = {alpha_r, alpha_i, alpha_r, alpha_i}
    vpackev.w $vr0, $vr1, $vr0
    vpackev.d VALPHA, $vr0, $vr0
    move     Y_ORG, Y
    move     PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h     K, I, 0    // Obtain the offset address
    PTR_ADD  T0, T0, K
    jirl     $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0_0 - .L_GAP_TABLE
    .hword .L_GAP_0_1 - .L_GAP_TABLE
    .hword .L_GAP_1_0 - .L_GAP_TABLE
    .hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if ((inc_x == 1) && (inc_y == 1)) */
    CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* if ((inc_x == 1) && (inc_y != 1)) */
    CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* if ((inc_x != 1) && (inc_y == 1)) */
    CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* if ((inc_x != 1) && (inc_y != 1)) */
    CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl $r0, $r1, 0x0
    EPILOGUE
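
One detail worth calling out from the PROLOGUE above: rather than branching, the kernel forms a two-bit index from the strides and jumps through the .L_GAP_TABLE halfword table to one of four loop specializations. A C sketch of that index computation, based on my reading of the maskeqz/alsl semantics (gap_index is a hypothetical helper, not code from the patch):

    /* Sketch of the .L_GAP_TABLE dispatch index:
     *   maskeqz I, K, I   with K = 1  ->  I = (inc_x != 1)
     *   maskeqz J, K, J   with K = 1  ->  J = (inc_y != 1)
     *   PTR_ALSL I, I, J, 1           ->  I = (I << 1) + J
     */
    static int gap_index(long inc_x, long inc_y)
    {
        int i = (inc_x != 1);
        int j = (inc_y != 1);
        return (i << 1) + j; /* 0 -> .L_GAP_0_0 ... 3 -> .L_GAP_1_1 */
    }

Index 0 (both strides unit) selects the contiguous CLOAD_Y_4/CSTORE_Y_4 path; index 3 falls back to the element-wise vldrepl.d/vstelm.d variants for both vectors.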

kernel/loongarch64/cgemv_n_8_lasx.S

Lines changed: 17 additions & 17 deletions
@@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
             X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-                X7, X7, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+                X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+                X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+                X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+                X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_X_8_GAP
@@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d X7, T0, 0x00
 
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-                X7, X7, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+                X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+                X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+                X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+                X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_Y_8
@@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro CLOAD_X_1
     GLDREPL xv, d, X0, X, 0x00
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_Y_1
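
The cgemv_n_8_lasx.S hunks above change nothing but the order of the two source operands of GCOMPLEXMUL, from `X0, X0, VALPHA` to `X0, VALPHA, X0`, bringing the LASX kernel in line with the operand order the new LSX kernels use. The order presumably matters whenever GXCONJ is set, since the macro (defined in loongarch64_asm.S, which says which position gets conjugated) conjugates a fixed source operand, and complex multiplication is not symmetric once one factor is conjugated. A small standalone C illustration of that asymmetry, not code from the patch:

    #include <complex.h>
    #include <stdio.h>
    
    /* Shows why source order matters once one factor is conjugated:
     * conjf(x) * alpha and conjf(alpha) * x generally differ. */
    int main(void)
    {
        float complex alpha = 1.0f + 2.0f * I;
        float complex x     = 3.0f - 1.0f * I;
        float complex p = conjf(x) * alpha;   /* = 1 + 7i */
        float complex q = conjf(alpha) * x;   /* = 1 - 7i */
        printf("conj(x)*alpha = %.1f%+.1fi\n", crealf(p), cimagf(p));
        printf("conj(alpha)*x = %.1f%+.1fi\n", crealf(q), cimagf(q));
        return 0;
    }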
