Skip to content

Commit 3f22fc2

Browse files
committed
LoongArch64: Add zgemv LSX opt
1 parent c508a10 commit 3f22fc2

File tree

4 files changed

+570
-0
lines changed

4 files changed

+570
-0
lines changed

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
118118
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
119119
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
120120

121+
ZGEMVNKERNEL = zgemv_n_2_lsx.S
122+
ZGEMVTKERNEL = zgemv_t_2_lsx.S
123+
121124
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
122125
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
123126
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S

kernel/loongarch64/loongarch64_asm.S

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
406406
.ifeqs "\suf_op", "s"
407407
vpackod.d \out, \in, \in
408408
\pre_op\()add.\suf_op \out, \out, \in
409+
.else
410+
vor.v \out, \in, \in
409411
.endif
410412
.endif
411413

414+
412415
.ifnb \more
413416
GCOMPLEXACC \pre_op, \suf_op, \more
414417
.endif

kernel/loongarch64/zgemv_n_2_lsx.S

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
/*******************************************************************************
2+
Copyright (c) 2024, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*******************************************************************************/
27+
#define ASSEMBLER
28+
29+
#include "common.h"
30+
#include "loongarch64_asm.S"
31+
32+
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
33+
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
34+
*/
35+
#define M $r4
36+
#define N $r5
37+
#define ALPHA_R $f0
38+
#define ALPHA_I $f1
39+
#define A $r7
40+
#define LDA $r8
41+
#define X $r9
42+
#define INC_X $r10
43+
#define Y $r11
44+
#define INC_Y $r6
45+
46+
#define J $r12
47+
#define I $r13
48+
#define K $r14
49+
#define Y_ORG $r15
50+
#define OFFSET $r16
51+
#define K_LDA $r17
52+
#define M16 $r18
53+
#define T0 $r19
54+
#define PA0 $r20
55+
#define PA1 $r23
56+
#define PA2 $r24
57+
#define PA3 $r25
58+
#define PA4 $r26
59+
#define PA5 $r27
60+
#define PA6 $r28
61+
#define PA7 $r29
62+
63+
#define VALPHA $vr1
64+
#define X0 $vr2
65+
#define X1 $vr3
66+
#define X2 $vr4
67+
#define X3 $vr5
68+
#define X4 $vr6
69+
#define X5 $vr7
70+
#define X6 $vr8
71+
#define X7 $vr9
72+
#define Y0 $vr10
73+
#define Y1 $vr11
74+
#define A0 $vr12
75+
#define A1 $vr13
76+
#define A2 $vr14
77+
#define A3 $vr15
78+
#define A4 $vr16
79+
#define A5 $vr17
80+
#define A6 $vr18
81+
#define A7 $vr19
82+
#define A8 $vr20
83+
#define A9 $vr21
84+
#define A10 $vr22
85+
#define A11 $vr23
86+
#define A12 $vr24
87+
#define A13 $vr25
88+
#define A14 $vr26
89+
#define A15 $vr27
90+
#define TMP0 $vr28
91+
#define TMP1 $vr29
92+
#define TMP2 $vr30
93+
94+
#if !defined(CONJ)
95+
#if !defined(XCONJ)
96+
#define GXCONJ 0
97+
#define GCONJ 0
98+
#else
99+
#define GXCONJ 1
100+
#define GCONJ 0
101+
#endif
102+
#else
103+
#if !defined(XCONJ)
104+
#define GXCONJ 0
105+
#define GCONJ 1
106+
#else
107+
#define GXCONJ 1
108+
#define GCONJ 1
109+
#endif
110+
#endif
111+
112+
// Load two contiguous complex doubles x[0], x[1] (16 bytes each) into X0/X1
// and pre-scale both by alpha (VALPHA = {alpha_r, alpha_i}, built in PROLOGUE).
// GLD / GCOMPLEXMUL are helper macros from loongarch64_asm.S; GXCONJ selects
// conjugation of x. TMP0-TMP2 are scratch.
.macro ZLOAD_X_2
113+
GLD v, , X0, X, 0x00, X1, X, 0x10
114+
// X0 = alpha * x[0], X1 = alpha * x[1] (complex multiply)
GCOMPLEXMUL GXCONJ, \
115+
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
116+
X1, VALPHA, X1, TMP0, TMP1, TMP2
117+
.endm
118+
119+
// Strided variant of ZLOAD_X_2: x elements are INC_X bytes apart
// (INC_X was scaled to inc_x * 16 in PROLOGUE). Loads then scales by alpha.
.macro ZLOAD_X_2_GAP
120+
vld X0, X, 0
121+
PTR_ADD T0, X, INC_X // T0 = address of the next x element
122+
vld X1, T0, 0
123+
124+
GCOMPLEXMUL GXCONJ, \
125+
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
126+
X1, VALPHA, X1, TMP0, TMP1, TMP2
127+
.endm
128+
129+
// Load a single complex double x[0] into X0 and scale it by alpha
// (stride irrelevant for one element, so used for both gap variants).
.macro ZLOAD_X_1
130+
GLD v, , X0, X, 0x00
131+
GCOMPLEXMUL GXCONJ, \
132+
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
133+
.endm
134+
135+
// Load two contiguous complex doubles y[0], y[1] into Y0/Y1.
.macro ZLOAD_Y_2
136+
GLD v, , Y0, Y, 0, Y1, Y, 0x10
137+
.endm
138+
139+
// Strided load of two y elements: y[0] at Y, next element at Y + INC_Y bytes.
// $vr10/$vr11 are Y0/Y1 spelled as raw registers (vldx takes a register offset).
.macro ZLOAD_Y_2_GAP
140+
vld $vr10, Y, 0
141+
vldx $vr11, Y, INC_Y
142+
.endm
143+
144+
// Load a single complex double y[0] into Y0 ($vr10).
.macro ZLOAD_Y_1
145+
vld $vr10, Y, 0
146+
.endm
147+
148+
// 2x2 block update: two rows of two consecutive columns.
// A0/A1 = a[i..i+1, j] from PA0, A2/A3 = a[i..i+1, j+1] from PA1; each
// GLD_INC load post-increments its pointer by 0x10 (one complex double).
// X0/X1 already hold alpha-scaled x[j], x[j+1] (see ZLOAD_X_*), so:
//   Y0 += X0*A0 + X1*A2 ; Y1 += X0*A1 + X1*A3
// with conjugation controlled by GXCONJ/GCONJ.
.macro ZGEMV_N_2x2
149+
GLD_INC v, , 0x10, \
150+
A0, PA0, 0, A1, PA0, 0, \
151+
A2, PA1, 0, A3, PA1, 0
152+
GCOMPLEXMADD GXCONJ, GCONJ, \
153+
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
154+
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2
155+
.endm
156+
157+
// 1x2 remainder update: one row, two columns.
// $vr12/$vr14 are A0/A2; loads a[i,j] from PA0 and a[i,j+1] from PA1
// (each pointer advances 0x10), then Y0 += X0*A0 + X1*A2.
.macro ZGEMV_N_1x2
158+
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0
159+
GCOMPLEXMADD GXCONJ, GCONJ, \
160+
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
161+
Y0, X1, A2, Y0, TMP0, TMP1, TMP2
162+
.endm
163+
164+
// 1x1 update for the single tail column: Y0 += X0 * a[i,j];
// the load advances PA0 by 0x10 ($vr12 is A0).
.macro ZGEMV_N_1x1
165+
GLD_INC v, , 0x10, $vr12, PA0, 0
166+
GCOMPLEXMADD GXCONJ, GCONJ, \
167+
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
168+
.endm
169+
170+
// Store Y0/Y1 back to two contiguous y elements.
.macro ZSTORE_Y_2
171+
GST v, , Y0, Y, 0, Y1, Y, 0x10
172+
.endm
173+
174+
// Strided store: Y0 -> y[0], Y1 -> Y + INC_Y bytes (mirrors ZLOAD_Y_2_GAP).
.macro ZSTORE_Y_2_GAP
175+
vst Y0, Y, 0
176+
vstx Y1, Y, INC_Y
177+
.endm
178+
179+
// Store the single element Y0 ($vr10) back to y[0].
.macro ZSTORE_Y_1
180+
vst $vr10, Y, 0
181+
.endm
182+
183+
// Driver for zgemv_n (y += alpha * A * x, double complex, LSX).
// Columns (N) are processed in pairs with an inner row loop over M, also
// unrolled by two; remainders fall through to 1-wide paths.
// \XW uniquifies the local labels per expansion; \X_2/\X_1/\Y_2/\Y_1 pick the
// contiguous or strided (GAP) load/store macros for x and y.
// PA0/PA1 walk columns j and j+1 of A.  K_LDA = 2*lda_bytes - M*16 rebases
// them one column pair forward, compensating the M*16 bytes the inner-loop
// GLD_INC loads already added.
.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req
184+
PTR_SRLI J, N, 1 // J = number of column pairs
185+
beqz J, .L_\XW\()_N_1
186+
PTR_SLLI K_LDA, LDA, 1
187+
PTR_SUB K_LDA, K_LDA, M16 // K_LDA = 2*lda_bytes - M*16
188+
.L_\XW\()_N_L2: // outer loop: columns j, j+1
189+
ZLOAD_\X_2
190+
xor K, K, K // K counts elements done; NOTE(review): K looks write-only in this kernel — confirm before relying on it
191+
move Y, Y_ORG // restart y for this column pair
192+
PTR_SRLI I, M, 1 // I = row pairs
193+
beqz I, .L_\XW\()_M_1
194+
.align 5
195+
.L_\XW\()_M_L2: // inner loop: rows i, i+1
196+
ZLOAD_\Y_2
197+
ZGEMV_N_2x2
198+
ZSTORE_\Y_2
199+
PTR_ADDI I, I, -1
200+
PTR_ALSL Y, INC_Y, Y, 1 // Y += 2 * INC_Y
201+
PTR_ADDI K, K, 4
202+
bnez I, .L_\XW\()_M_L2
203+
.L_\XW\()_M_1: // odd trailing row, if any
204+
andi I, M, 1
205+
beqz I, .L_\XW\()_M_END
206+
.align 5
207+
.L_\XW\()_M_L1:
208+
ZLOAD_\Y_1
209+
ZGEMV_N_1x2
210+
ZSTORE_\Y_1
211+
PTR_ADDI I, I, -1
212+
PTR_ADD Y, Y, INC_Y
213+
PTR_ADDI K, K, 1
214+
bnez I, .L_\XW\()_M_L1
215+
.L_\XW\()_M_END:
216+
PTR_ADDI J, J, -1
217+
#if __loongarch_grlen == 64
218+
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA // rebase to columns j+2, j+3
219+
#elif __loongarch_grlen == 32
220+
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
221+
#else
222+
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
223+
#endif
224+
PTR_ALSL X, INC_X, X, 1 // X += 2 * INC_X (next column pair)
225+
bnez J, .L_\XW\()_N_L2
226+
.L_\XW\()_N_1: // tail column when N is odd
227+
andi J, N, 1
228+
beqz J, .L_END
229+
.L_\XW\()_N_L1: // J is N & 1 here, so this body runs at most once
230+
ZLOAD_\X_1
231+
xor K, K, K
232+
move Y, Y_ORG
233+
move I, M
234+
beqz I, .L_END
235+
.align 5
236+
.L_\XW\()_N_1_M_L1: // row loop for the single tail column
237+
ZLOAD_\Y_1
238+
ZGEMV_N_1x1
239+
ZSTORE_\Y_1
240+
PTR_ADDI I, I, -1
241+
PTR_ADD Y, Y, INC_Y
242+
PTR_ADDI K, K, 1
243+
bnez I, .L_\XW\()_N_1_M_L1
244+
.L_\XW\()_N_1_M_END:
245+
PTR_ADDI J, J, -1
246+
PTR_SUB K_LDA, LDA, M16 // single-column rebase
247+
PTR_ADD PA0, PA0, K_LDA
248+
PTR_ADD X, X, INC_X
249+
bnez J, .L_\XW\()_N_L1
250+
251+
b .L_END
252+
.endm
253+
254+
// Entry point.  Scales all strides to bytes, builds VALPHA, then dispatches
// on (inc_x == 1?, inc_y == 1?) through a halfword offset table to one of
// four ZGEMV_N_LSX expansions (each expansion ends with `b .L_END`, so the
// .L_GAP_* entries do not fall through into each other).
// Register args per the signature above: M=$r4, N=$r5, alpha in $f0/$f1,
// A=$r7, LDA=$r8, X=$r9, INC_X=$r10, Y=$r11; inc_y comes in on the stack.
PROLOGUE
255+
PTR_LD INC_Y, $sp, 0 // stack argument inc_y; load before the frame is pushed
256+
push_if_used 17 + 7, 31 // save callee-saved regs (helper from loongarch64_asm.S)
257+
PTR_ADDI K, $r0, 0x01 // K = 1
258+
PTR_SUB I, INC_X, K
259+
PTR_SUB J, INC_Y, K
260+
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
261+
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
262+
PTR_ALSL I, I, J, 1 // I = 2*(inc_x != 1) + (inc_y != 1): dispatch-table index
263+
// Scale lda/inc_x/inc_y and M from elements to bytes (<<4 = x16, sizeof(double complex)); M16 = M * 16
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
264+
// Init VALPHA = {alpha_r, alpha_i}
265+
vpackev.d VALPHA, $vr1, $vr0
266+
move Y_ORG, Y // keep original y base; loops restart from it per column
267+
move PA0, A // PA0 -> column j
268+
#if __loongarch_grlen == 64
269+
GADD , d, PA1, PA0, LDA // PA1 -> column j+1
270+
#elif __loongarch_grlen == 32
271+
GADD , w, PA1, PA0, LDA
272+
#else
273+
GADD , d, PA1, PA0, LDA
274+
#endif
275+
la.local T0, .L_GAP_TABLE
276+
PTR_ALSL I, I, T0, 1 // each table entry is one halfword (2 bytes)
277+
ld.h K, I, 0 // Obtain the offset address (signed halfword relative to the table)
278+
PTR_ADD T0, T0, K
279+
jirl $r0, T0, 0 // indirect jump to the selected .L_GAP_* entry
280+
.L_GAP_TABLE:
281+
.hword .L_GAP_0_0 - .L_GAP_TABLE
282+
.hword .L_GAP_0_1 - .L_GAP_TABLE
283+
.hword .L_GAP_1_0 - .L_GAP_TABLE
284+
.hword .L_GAP_1_1 - .L_GAP_TABLE
285+
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
286+
ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
287+
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
288+
ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
289+
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
290+
ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
291+
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
292+
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
293+
.L_END:
294+
pop_if_used 17 + 7, 31 // restore callee-saved regs
295+
jirl $r0, $r1, 0x0 // return
296+
EPILOGUE

0 commit comments

Comments
 (0)