@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828#define ASSEMBLER
2929
3030#include "common.h"
31+ #include "loongarch64_asm.S"
3132
3233/* Param */
3334#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5758#define T2 $r28
5859#define T3 $r29
5960#define T4 $r30
61+ #define T5 $r17 /* INCY - 1, computed before INCY is shifted by BASE_SHIFT; zero selects the vldx/vstx path for Y */
62+ #define T6 $r16 /* INCX - 1, computed before INCX is shifted by BASE_SHIFT; zero selects the vldx path for X */
6063
6164/* LSX vectors */
6265#define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8891#define a9 $f9
8992
9093
91- PROLOGUE
92-
93- LDARG BUFFER, $sp , 0
94-
95- addi .d $sp , $sp , -88
96-
97- SDARG $r23, $sp , 0
98- SDARG $r24, $sp , 8
99- SDARG $r25, $sp , 16
100- SDARG $r26, $sp , 32
101- SDARG $r27, $sp , 40
102- SDARG $r28, $sp , 48
103- SDARG $r29, $sp , 56
104- SDARG $r30, $sp , 64
105- SDARG $r31, $sp , 72
106- ST ALPHA, $sp , 80
107-
108- vldrepl.w VALPHA, $sp , 80
109-
110- slli.d LDA, LDA, BASE_SHIFT
111- slli.d INCX, INCX, BASE_SHIFT
112- slli.d INCY, INCY, BASE_SHIFT
113-
114- bge $r0, M, .L999
115- bge $r0, N, .L999
116-
117- move J, $r0
118- move JY, $r0
119- move JX, $r0
120- move AO1, A
121-
122- beq J , N, .L999
123-
124- .L01:
125- MTC a2 , $r0 //temp2
126- fldx.s a6 , X, JX
127- fmul .s a3, ALPHA, a6 //temp1
128- vpermi.w U3, U3, 0x00
129- vpermi.w U2, U2, 0x00
130-
131- mul.w T0, J, LDA
132- slli.d T1, J, BASE_SHIFT
133- add.w T0, T0, T1
134- fldx.s a6 , AO1, T0
135- fldx.s a4 , Y, JY
136- fmadd.s a4 , a3 , a6 , a4
137- fstx.s a4 , Y, JY
138-
139- move IY, JY
140- move IX, JX
141- addi .d II, J, 1
142- move I, II
143- slli.d II, II, BASE_SHIFT
144-
145- sub .d T0, M, J
146- addi .d T0, T0, -1
147- srai.d T0, T0, 3
148- add .d T0, T0, J
149- addi .d T0, T0, 1
150- beq I , T0, .L03
151- bge I , T0, .L03
152-
153- mul.w T1, J, LDA
154- add .d T1, T1, II
155-
156- .L02: /* /8 */
157- vldx U1, AO1, T1
158- addi .d T1, T1, 16
159- vldx U14, AO1, T1
160- addi .d T1, T1, 16
161-
94+ .macro LOAD_Y_8 /* loads from Y into U4 and U8 for the caller's vfmadd.s; T5 picks the path */
95+ beqz T5, .L01_Y_0 /* T5 == 0 (INCY was 1): use the two-vldx path at .L01_Y_0; otherwise fall through to the per-element fldx.s loads */
16296 add .d T2, IY, INCY
16397 fldx.s $f4 , Y, T2
16498 add .d T2, T2, INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183117 vextrins.w U8, U9, 0x10
184118 vextrins.w U8, U10, 0x20
185119 vextrins.w U8, U11, 0x30
186-
187- vfmadd.s U4, U3, U1, U4
188- vfmadd.s U8, U3, U14, U8
189-
120+ b .L01_Y_1 /* per-element path finished; skip the vldx path */
121+ .L01_Y_0: /* reached from the beqz T5 at the top of the macro */
122+ add .d T3, IY, INCY /* T3 = IY + INCY; STORE_Y_8's vstx path reuses this value */
123+ vldx U4, Y, T3 /* U4 = vector loaded from Y + T3 */
124+ alsl.d T4, INCY, T3, 2 /* T4 = T3 + (INCY << 2); STORE_Y_8's vstx path reuses this value */
125+ vldx U8, Y, T4 /* U8 = vector loaded from Y + T4 */
126+ .L01_Y_1: /* NOTE(review): labels here are not unique per expansion, so a second LOAD_Y_8 in the file would redefine them -- confirm the macro is expanded only once */
127+ .endm
128+
129+ .macro STORE_Y_8 /* stores updated values back to Y; T5 picks the path */
130+ beqz T5, .L01_Y_2 /* T5 == 0 (INCY was 1): use the two-vstx path at .L01_Y_2; otherwise fall through to the per-element fstx.s stores */
190131 vextrins.w U5, U4, 0x01
191132 vextrins.w U6, U4, 0x02
192133 vextrins.w U7, U4, 0x03
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
211152 fstx.s $f10 , Y, T2
212153 add .d T2, T2, INCY
213154 fstx.s $f11 , Y, T2
214-
215- slli.d T2, INCY, 3
216- add .d IY, IY, T2
217-
155+ b .L01_Y_3 /* per-element path finished; skip the vstx path */
156+ .L01_Y_2: /* reached from the beqz T5 at the top of the macro */
157+ vstx U4, Y, T3 /* T3 is not set in this macro: it must still hold the offset computed by LOAD_Y_8's vldx path */
158+ vstx U8, Y, T4 /* same for T4; LOAD_X_8's vldx path overwrites both, so it has to run after this store */
159+ .L01_Y_3: /* NOTE(review): fixed label names -- a second expansion of STORE_Y_8 would redefine them; confirm single use */
160+ .endm
161+
162+ .macro LOAD_X_8 /* loads from X into U4 and U8 for the caller's vfmadd.s into U2; T6 picks the path */
163+ beqz T6, .L01_X_0 /* T6 == 0 (INCX was 1): use the two-vldx path at .L01_X_0; otherwise fall through to the per-element fldx.s loads */
218164 add .d T2, IX, INCX
219165 fldx.s $f4 , X, T2
220166 add .d T2, T2, INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
239185 vextrins.w $vr8, $vr9, 0x10
240186 vextrins.w $vr8, $vr10, 0x20
241187 vextrins.w $vr8, $vr11, 0x30
188+ b .L01_X_1 /* per-element path finished; skip the vldx path */
189+ .L01_X_0: /* reached from the beqz T6 at the top of the macro */
190+ add .d T3, IX, INCX /* T3 = IX + INCX (overwrites the Y offset left by LOAD_Y_8) */
191+ vldx U4, X, T3 /* U4 = vector loaded from X + T3 */
192+ alsl.d T4, INCX, T3, 2 /* T4 = T3 + (INCX << 2) */
193+ vldx U8, X, T4 /* U8 = vector loaded from X + T4 */
194+ .L01_X_1: /* NOTE(review): fixed label names -- a second expansion of LOAD_X_8 would redefine them; confirm single use */
195+ .endm
242196
243- vand.v $vr12, $vr2, $vr2
197+ PROLOGUE
244198
245- vfmadd.s U2, U1, U4, U2
246- vfsub.s U2, U2, $vr12
247- vfmadd.s U2, U14, U8, U2
199+ addi .d $sp , $sp , -88
248200
249- vextrins.w U4, U2, 0x01
250- vextrins.w U5, U2, 0x02
251- vextrins.w U6, U2, 0x03
201+ SDARG $r23, $sp , 0
202+ SDARG $r24, $sp , 8
203+ SDARG $r25, $sp , 16
204+ SDARG $r26, $sp , 32
205+ SDARG $r27, $sp , 40
206+ SDARG $r28, $sp , 48
207+ SDARG $r29, $sp , 56
208+ SDARG $r30, $sp , 64
209+ SDARG $r31, $sp , 72
210+ ST ALPHA, $sp , 80
252211
253- fadd .s $f2, $f2, $f4
254- fadd .s $f2, $f2, $f5
255- fadd .s $f2, $f2, $f6
256- fadd .s $f2, $f2, $f12
212+ vldrepl.w VALPHA, $sp , 80
257213
258- vpermi.w U2, U2, 0x00
214+ addi .d T5, INCY, -1 /* T5 = INCY - 1, captured before INCY is scaled below; tested with beqz in LOAD_Y_8 / STORE_Y_8 */
215+ addi .d T6, INCX, -1 /* T6 = INCX - 1, captured before INCX is scaled below; tested with beqz in LOAD_X_8 */
216+ slli.d LDA, LDA, BASE_SHIFT /* LDA <<= BASE_SHIFT -- presumably element count to byte stride; BASE_SHIFT is defined outside this view */
217+ slli.d INCX, INCX, BASE_SHIFT /* INCX <<= BASE_SHIFT; used from here on as an offset step for X */
218+ slli.d INCY, INCY, BASE_SHIFT /* INCY <<= BASE_SHIFT; used from here on as an offset step for Y */
219+
220+ bge $r0, M, .L999
221+ bge $r0, N, .L999
222+
223+ move J, $r0
224+ move JY, $r0
225+ move JX, $r0
226+ move AO1, A
227+
228+ beq J , N, .L999
229+
230+ .L01: /* start of the per-J work; the branch back to this label is outside this view */
231+ vxor.v U2, U2, U2 /* clear U2, the vector accumulator the .L02 loop adds into */
232+ fldx.s a6 , X, JX /* a6 = single-precision value at X + JX */
233+ fmul .s a3, ALPHA, a6 // temp1 = ALPHA * a6
234+ vpermi.w U3, U3, 0x00 /* presumably spreads temp1 across U3's lanes, assuming a3 aliases lane 0 of U3 -- confirm against the register defines */
235+
236+ mul.w T0, J, LDA /* T0 = J * LDA; NOTE(review): 32-bit multiply while LDA was scaled with 64-bit slli.d -- confirm the offset cannot exceed 32 bits */
237+ slli.d T1, J, BASE_SHIFT /* T1 = J << BASE_SHIFT */
238+ add.w T0, T0, T1 /* T0 = J * LDA + (J << BASE_SHIFT); NOTE(review): 32-bit add, same concern as the mul.w above */
239+ fldx.s a6 , AO1, T0 /* a6 = value at AO1 + T0 -- presumably the diagonal entry for index J; confirm layout */
240+ fldx.s a4 , Y, JY /* a4 = value at Y + JY */
241+ fmadd.s a4 , a3 , a6 , a4 /* a4 = temp1 * a6 + a4 */
242+ fstx.s a4 , Y, JY /* write the updated value back to Y + JY */
243+
244+ move IY, JY
245+ move IX, JX
246+ addi .d II, J, 1
247+ move I, II
248+ slli.d II, II, BASE_SHIFT
259249
260- slli.d T2, INCX, 3
261- add .d IX, IX, T2
250+ sub .d T0, M, J /* T0 = M - J */
251+ addi .d T0, T0, -1 /* T0 = M - J - 1 */
252+ srai.d T0, T0, 3 /* T0 = (M - J - 1) >> 3: number of passes the .L02 loop will make */
253+ add .d T0, T0, J /* rebase the count so it can be compared against I ... */
254+ addi .d T0, T0, 1 /* ... which starts at J + 1: T0 = J + 1 + ((M - J - 1) >> 3) */
255+ beq I , T0, .L03 /* NOTE(review): this equality test is already covered by the bge on the next line */
256+ bge I , T0, .L03 /* no full pass to run: skip the .L02 loop */
257+
258+ mul.w T1, J, LDA
259+ add .d T1, T1, II
260+
261+ .L02: /* /8: each pass consumes 32 bytes of A as two vectors (U1, U14) */
262+ vldx U1, AO1, T1 /* U1 = vector at AO1 + T1 */
263+ addi .d T1, T1, 16 /* advance the A offset by 16 bytes */
264+ vldx U14, AO1, T1 /* U14 = the following vector of A */
265+ addi .d T1, T1, 16 /* advance again so T1 is ready for the next pass */
266+
267+ LOAD_Y_8
268+
269+ vfmadd.s U4, U3, U1, U4
270+ vfmadd.s U8, U3, U14, U8
271+
272+ STORE_Y_8
273+
274+ alsl.d IY, INCY, IY, 3
275+
276+ LOAD_X_8
277+
278+ vfmadd.s U2, U1, U4, U2
279+ vfmadd.s U2, U14, U8, U2
280+
281+ alsl.d IX, INCX, IX, 3
262282
263283 addi .d II, II, 32
264284 addi .d I, I, 1
265285 blt I , T0, .L02
266286
287+ // Accumulate U2 once after the loop; GACC comes from loongarch64_asm.S and presumably reduces U2's lanes into U4 -- confirm against the macro
288+ GACC vf, s, U4, U2
289+ vpermi.w U2, U4, 0 /* refill U2 from U4 with selector 0; NOTE(review): confirm which lanes of U2 the code after .L03 reads */
290+
267291.L03: /* &4 */
268292 sub .d T0, M, J
269293 addi .d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
426450 addi .d $sp , $sp , 88
427451 jirl $r0, $r1, 0x0
428452
429- EPILOGUE
453+ EPILOGUE
0 commit comments