@@ -206,59 +206,46 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
206206;
207207; CHECK-NEWLOWERING-LABEL: udot_8to64:
208208; CHECK-NEWLOWERING: // %bb.0: // %entry
209- ; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210- ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
211- ; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
212- ; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
213- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
214- ; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
215- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
216- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
217- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
218- ; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
219- ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
209+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z3.b
210+ ; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z2.b
220211; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
212+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
221213; CHECK-NEWLOWERING-NEXT: ptrue p0.d
222214; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
223- ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
224215; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
216+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
225217; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
226- ; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
227- ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
228- ; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
229- ; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
230- ; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
231- ; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
232- ; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
233- ; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
218+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z24.s, z3.h
219+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z25.s, z2.h
220+ ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
221+ ; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
222+ ; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
223+ ; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
224+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
225+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
226+ ; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z4.s
234227; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
235228; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
236- ; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
237229; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
238- ; CHECK-NEWLOWERING-NEXT: uunpkhi z30 .d, z24.s
239- ; CHECK-NEWLOWERING-NEXT: uunpkhi z31 .d, z2 .s
240- ; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
241- ; CHECK-NEWLOWERING-NEXT: uunpklo z2 .d, z2.s
242- ; CHECK-NEWLOWERING-NEXT: uunpkhi z8 .d, z25.s
243- ; CHECK-NEWLOWERING-NEXT: uunpklo z25 .d, z25 .s
244- ; CHECK-NEWLOWERING-NEXT: uunpklo z9 .d, z3.s
245- ; CHECK-NEWLOWERING-NEXT: mul z27 .d, z27.d, z29.d
246- ; CHECK-NEWLOWERING-NEXT: mla z0 .d, p0/m, z6.d, z28.d
230+ ; CHECK-NEWLOWERING-NEXT: mla z0 .d, p0/m, z27.d, z26.d
231+ ; CHECK-NEWLOWERING-NEXT: uunpklo z26 .d, z24 .s
232+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
233+ ; CHECK-NEWLOWERING-NEXT: mla z1 .d, p0/m, z7.d, z6.d
234+ ; CHECK-NEWLOWERING-NEXT: uunpklo z6 .d, z25.s
235+ ; CHECK-NEWLOWERING-NEXT: uunpklo z7 .d, z3 .s
236+ ; CHECK-NEWLOWERING-NEXT: mul z27 .d, z29.d, z28.d
237+ ; CHECK-NEWLOWERING-NEXT: uunpklo z28 .d, z2.s
238+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z25 .d, z25.s
247239; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
248- ; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
249- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
250- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
251- ; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
252- ; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
253- ; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
254- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
255- ; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
256- ; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
257- ; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
258- ; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
259- ; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
260- ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
261- ; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
240+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
241+ ; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
242+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
243+ ; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
244+ ; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
245+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
246+ ; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
247+ ; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
248+ ; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
262249; CHECK-NEWLOWERING-NEXT: ret
263250entry:
264251 %a.wide = zext <vscale x 16 x i8 > %a to <vscale x 16 x i64 >
@@ -282,59 +269,46 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
282269;
283270; CHECK-NEWLOWERING-LABEL: sdot_8to64:
284271; CHECK-NEWLOWERING: // %bb.0: // %entry
285- ; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
286- ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
287- ; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
288- ; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
289- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
290- ; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
291- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
292- ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
293- ; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
294- ; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
295- ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
272+ ; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z3.b
273+ ; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z2.b
296274; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
275+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
297276; CHECK-NEWLOWERING-NEXT: ptrue p0.d
298277; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
299- ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
300278; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
279+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
301280; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
302- ; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
303- ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
304- ; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
305- ; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
306- ; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
307- ; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
308- ; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
309- ; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
281+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z24.s, z3.h
282+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z25.s, z2.h
283+ ; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
284+ ; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
285+ ; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
286+ ; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
287+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
288+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
289+ ; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z4.s
310290; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
311291; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
312- ; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
313292; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
314- ; CHECK-NEWLOWERING-NEXT: sunpkhi z30 .d, z24.s
315- ; CHECK-NEWLOWERING-NEXT: sunpkhi z31 .d, z2 .s
316- ; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
317- ; CHECK-NEWLOWERING-NEXT: sunpklo z2 .d, z2.s
318- ; CHECK-NEWLOWERING-NEXT: sunpkhi z8 .d, z25.s
319- ; CHECK-NEWLOWERING-NEXT: sunpklo z25 .d, z25 .s
320- ; CHECK-NEWLOWERING-NEXT: sunpklo z9 .d, z3.s
321- ; CHECK-NEWLOWERING-NEXT: mul z27 .d, z27.d, z29.d
322- ; CHECK-NEWLOWERING-NEXT: mla z0 .d, p0/m, z6.d, z28.d
293+ ; CHECK-NEWLOWERING-NEXT: mla z0 .d, p0/m, z27.d, z26.d
294+ ; CHECK-NEWLOWERING-NEXT: sunpklo z26 .d, z24 .s
295+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
296+ ; CHECK-NEWLOWERING-NEXT: mla z1 .d, p0/m, z7.d, z6.d
297+ ; CHECK-NEWLOWERING-NEXT: sunpklo z6 .d, z25.s
298+ ; CHECK-NEWLOWERING-NEXT: sunpklo z7 .d, z3 .s
299+ ; CHECK-NEWLOWERING-NEXT: mul z27 .d, z29.d, z28.d
300+ ; CHECK-NEWLOWERING-NEXT: sunpklo z28 .d, z2.s
301+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z25 .d, z25.s
323302; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
324- ; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
325- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
326- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
327- ; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
328- ; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
329- ; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
330- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
331- ; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
332- ; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
333- ; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
334- ; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
335- ; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
336- ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
337- ; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
303+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
304+ ; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
305+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
306+ ; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
307+ ; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
308+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
309+ ; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
310+ ; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
311+ ; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
338312; CHECK-NEWLOWERING-NEXT: ret
339313entry:
340314 %a.wide = sext <vscale x 16 x i8 > %a to <vscale x 16 x i64 >
@@ -816,11 +790,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
816790; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
817791; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
818792; CHECK-NEWLOWERING-NEXT: ptrue p0.s
819- ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
820- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
821- ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
793+ ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
794+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
822795; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
823- ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
796+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
797+ ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s
824798; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
825799; CHECK-NEWLOWERING-NEXT: ret
826800entry:
@@ -850,11 +824,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
850824; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
851825; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
852826; CHECK-NEWLOWERING-NEXT: ptrue p0.d
853- ; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
854- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
855- ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
827+ ; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
828+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
856829; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
857- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
830+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
831+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z3.d
858832; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
859833; CHECK-NEWLOWERING-NEXT: ret
860834entry:
@@ -1221,10 +1195,8 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
12211195;
12221196; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
12231197; CHECK-NEWLOWERING: // %bb.0: // %entry
1224- ; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
12251198; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
1226- ; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1227- ; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
1199+ ; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
12281200; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
12291201; CHECK-NEWLOWERING-NEXT: ret
12301202entry:
@@ -1257,11 +1229,9 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
12571229; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
12581230; CHECK-NEWLOWERING: // %bb.0: // %entry
12591231; CHECK-NEWLOWERING-NEXT: ptrue p0.h
1260- ; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
12611232; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
1262- ; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
1263- ; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1
1264- ; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
1233+ ; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
1234+ ; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h
12651235; CHECK-NEWLOWERING-NEXT: ret
12661236entry:
12671237 %a.wide = sext <vscale x 8 x i8 > %a to <vscale x 8 x i16 >
0 commit comments