@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 
 #include "common.h"
+#include "loongarch64_asm.S"
 
 /* Param */
 #define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2 $r28
 #define T3 $r29
 #define T4 $r30
+#define T5 $r17
+#define T6 $r16
+#define T7 $r12
 
 /* LSX vectors */
 #define U0 $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8 $f8
 #define a9 $f9
 
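+// LOAD_Y_8: load 8 doubles of Y into U4/U6/U8/U10, using vector loads when INCY == 1
+// (T5 == 0) and strided scalar loads packed pairwise with vextrins.d otherwise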
+.macro LOAD_Y_8
+    beqz    T5, .L01_Y_0
+    add.d   T2, IY, INCY
+    fldx.d  $f4, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f5, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f6, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f7, Y, T2
 
-    PROLOGUE
+    add.d   T2, T2, INCY
+    fldx.d  $f8, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f9, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f10, Y, T2
+    add.d   T2, T2, INCY
+    fldx.d  $f11, Y, T2
+
+    vextrins.d  U4, U5, 0x10
+    vextrins.d  U6, U7, 0x10
+    vextrins.d  U8, U9, 0x10
+    vextrins.d  U10, U11, 0x10
+    b       .L01_Y_1
+.L01_Y_0:
+    add.d   T7, IY, INCY
+    vldx    U4, Y, T7
+    alsl.d  T2, INCY, T7, 1
+    vldx    U6, Y, T2
+    alsl.d  T3, INCY, T2, 1
+    vldx    U8, Y, T3
+    alsl.d  T4, INCY, T3, 1
+    vldx    U10, Y, T4
+.L01_Y_1:
+.endm
+
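+// LOAD_X_8: same pattern as LOAD_Y_8, reading X and taking the vector path when INCX == 1 (T6 == 0)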
+.macro LOAD_X_8
+    beqz    T6, .L01_X_0
+    add.d   T2, IX, INCX
+    fldx.d  $f4, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f5, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f6, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f7, X, T2
+
+    add.d   T2, T2, INCX
+    fldx.d  $f8, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f9, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f10, X, T2
+    add.d   T2, T2, INCX
+    fldx.d  $f11, X, T2
+
+    vextrins.d  U4, U5, 0x10
+    vextrins.d  U6, U7, 0x10
+    vextrins.d  U8, U9, 0x10
+    vextrins.d  U10, U11, 0x10
+    b       .L01_X_1
+.L01_X_0:
+    add.d   T7, IX, INCX
+    vldx    U4, X, T7
+    alsl.d  T2, INCX, T7, 1
+    vldx    U6, X, T2
+    alsl.d  T3, INCX, T2, 1
+    vldx    U8, X, T3
+    alsl.d  T4, INCX, T3, 1
+    vldx    U10, X, T4
+.L01_X_1:
+.endm
+
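+// STORE_Y_8: write U4/U6/U8/U10 back to Y; the strided path unpacks the high lanes with
+// vextrins.d and stores with fstx.d, the vector path reuses the addresses left in T7/T2/T3/T4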
+.macro STORE_Y_8
+    beqz    T5, .L01_Y_2
+    vextrins.d  U5, U4, 0x01
+    vextrins.d  U7, U6, 0x01
+    vextrins.d  U9, U8, 0x01
+    vextrins.d  U11, U10, 0x01
+
+    add.d   T2, IY, INCY
+    fstx.d  $f4, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f5, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f6, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f7, Y, T2
+
+    add.d   T2, T2, INCY
+    fstx.d  $f8, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f9, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f10, Y, T2
+    add.d   T2, T2, INCY
+    fstx.d  $f11, Y, T2
+    b       .L01_Y_3
+.L01_Y_2:
+    vstx    U4, Y, T7
+    vstx    U6, Y, T2
+    vstx    U8, Y, T3
+    vstx    U10, Y, T4
+.L01_Y_3:
+.endm
 
-    LDARG   BUFFER, $sp, 0
+    PROLOGUE
 
     addi.d  $sp, $sp, -88
 
@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     vldrepl.d VALPHA, $sp, 80
 
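+    // T5/T6 become zero only for unit strides, so the macros above can test them;
+    // they are computed before INCY/INCX are scaled by BASE_SHIFT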
+    addi.d  T5, INCY, -1
+    addi.d  T6, INCX, -1
     slli.d  LDA, LDA, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq     J, N, .L999
 
 .L01:
-    MTC     a2, $r0 //temp2
+    vxor.v  U2, U2, U2
     fldx.d  a6, X, JX
     fmul.d  a3, ALPHA, a6 //temp1
     vshuf4i.d U3, U3, 0x00
-    vshuf4i.d U2, U2, 0x00
 
     mul.d   T0, J, LDA
     slli.d  T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldx    U16, AO1, T1
     addi.d  T1, T1, 16
 
-    add.d   T2, IY, INCY
-    fldx.d  $f4, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f5, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f6, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f7, Y, T2
-
-    add.d   T2, T2, INCY
-    fldx.d  $f8, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f9, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f10, Y, T2
-    add.d   T2, T2, INCY
-    fldx.d  $f11, Y, T2
-
-    vextrins.d  U4, U5, 0x10
-    vextrins.d  U6, U7, 0x10
-    vextrins.d  U8, U9, 0x10
-    vextrins.d  U10, U11, 0x10
+    LOAD_Y_8
 
     vfmadd.d  U4, U3, U1, U4
     vfmadd.d  U6, U3, U14, U6
     vfmadd.d  U8, U3, U15, U8
     vfmadd.d  U10, U3, U16, U10
 
-    vextrins.d  U5, U4, 0x01
-    vextrins.d  U7, U6, 0x01
-    vextrins.d  U9, U8, 0x01
-    vextrins.d  U11, U10, 0x01
-
-    add.d   T2, IY, INCY
-    fstx.d  $f4, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f5, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f6, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f7, Y, T2
-
-    add.d   T2, T2, INCY
-    fstx.d  $f8, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f9, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f10, Y, T2
-    add.d   T2, T2, INCY
-    fstx.d  $f11, Y, T2
-
-    slli.d  T2, INCY, 3
-    add.d   IY, IY, T2
-
-    add.d   T2, IX, INCX
-    fldx.d  $f4, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f5, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f6, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f7, X, T2
-
-    add.d   T2, T2, INCX
-    fldx.d  $f8, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f9, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f10, X, T2
-    add.d   T2, T2, INCX
-    fldx.d  $f11, X, T2
+    STORE_Y_8
 
-    vextrins.d  U4, U5, 0x10
-    vextrins.d  U6, U7, 0x10
-    vextrins.d  U8, U9, 0x10
-    vextrins.d  U10, U11, 0x10
+    alsl.d  IY, INCY, IY, 3
 
-    vand.v  $vr12, $vr2, $vr2
+    LOAD_X_8
 
     vfmadd.d  U2, U1, U4, U2
-    vfsub.d  U2, U2, $vr12
     vfmadd.d  U2, U14, U6, U2
     vfmadd.d  U2, U15, U8, U2
     vfmadd.d  U2, U16, U10, U2
 
-    vextrins.d  U4, U2, 0x01
-
-    fadd.d  $f2, $f2, $f4
-    fadd.d  $f2, $f2, $f12
-
-    vextrins.d  U2, U2, 0x10
-
-    slli.d  T2, INCX, 3
-    add.d   IX, IX, T2
+    alsl.d  IX, INCX, IX, 3
 
     addi.d  II, II, 64
     addi.d  I, I, 1
     blt     I, T0, .L02
 
+    // Acc U2
+    GACC    vf, d, U4, U2
+    vilvl.d U2, U4, U4
+
 .L03: /* &4 */
     sub.d   T0, M, J
     addi.d  T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d  $sp, $sp, 88
     jirl    $r0, $r1, 0x0
 
-    EPILOGUE
+    EPILOGUE