@@ -206,46 +206,59 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
206206;
207207; CHECK-NEWLOWERING-LABEL: udot_8to64:
208208; CHECK-NEWLOWERING: // %bb.0: // %entry
209- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z3.b
210- ; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z2.b
211- ; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
209+ ; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210+ ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
211+ ; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
212+ ; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
213+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
214+ ; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
215+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
216+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
217+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
218+ ; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
212219; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
220+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
213221; CHECK-NEWLOWERING-NEXT: ptrue p0.d
214222; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
215- ; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
216223; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
224+ ; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
217225; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
218- ; CHECK-NEWLOWERING-NEXT: uunpkhi z24.s, z3.h
219- ; CHECK-NEWLOWERING-NEXT: uunpkhi z25.s, z2.h
220- ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
221- ; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
222- ; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
223- ; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
224- ; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
225- ; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
226- ; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z4.s
226+ ; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
227+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
228+ ; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
229+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
230+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
231+ ; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
232+ ; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
233+ ; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
227234; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
228235; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
236+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
229237; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
230- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
231- ; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z24.s
232- ; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
233- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
234- ; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z25.s
235- ; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
236- ; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
237- ; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z2.s
238- ; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
238+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
239+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
240+ ; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
241+ ; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
242+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
243+ ; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
244+ ; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
245+ ; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
246+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
239247; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
240- ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
241- ; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
242- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
243- ; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
244- ; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
245- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
246- ; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
247- ; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
248- ; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
248+ ; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
249+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
250+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
251+ ; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
252+ ; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
253+ ; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
254+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
255+ ; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
256+ ; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
257+ ; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
258+ ; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
259+ ; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
260+ ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
261+ ; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
249262; CHECK-NEWLOWERING-NEXT: ret
250263entry:
251264 %a.wide = zext <vscale x 16 x i8 > %a to <vscale x 16 x i64 >
@@ -269,46 +282,59 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
269282;
270283; CHECK-NEWLOWERING-LABEL: sdot_8to64:
271284; CHECK-NEWLOWERING: // %bb.0: // %entry
272- ; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z3.b
273- ; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z2.b
274- ; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
285+ ; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
286+ ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
287+ ; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
288+ ; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
289+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
290+ ; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
291+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
292+ ; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
293+ ; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
294+ ; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
275295; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
296+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
276297; CHECK-NEWLOWERING-NEXT: ptrue p0.d
277298; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
278- ; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
279299; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
300+ ; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
280301; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
281- ; CHECK-NEWLOWERING-NEXT: sunpkhi z24.s, z3.h
282- ; CHECK-NEWLOWERING-NEXT: sunpkhi z25.s, z2.h
283- ; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
284- ; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
285- ; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
286- ; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
287- ; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
288- ; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
289- ; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z4.s
302+ ; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
303+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
304+ ; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
305+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
306+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
307+ ; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
308+ ; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
309+ ; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
290310; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
291311; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
312+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
292313; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
293- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
294- ; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z24.s
295- ; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
296- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
297- ; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z25.s
298- ; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
299- ; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
300- ; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z2.s
301- ; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
314+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
315+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
316+ ; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
317+ ; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
318+ ; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
319+ ; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
320+ ; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
321+ ; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
322+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
302323; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
303- ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
304- ; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
305- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
306- ; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
307- ; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
308- ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
309- ; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
310- ; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
311- ; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
324+ ; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
325+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
326+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
327+ ; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
328+ ; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
329+ ; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
330+ ; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
331+ ; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
332+ ; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
333+ ; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
334+ ; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
335+ ; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
336+ ; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
337+ ; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
312338; CHECK-NEWLOWERING-NEXT: ret
313339entry:
314340 %a.wide = sext <vscale x 16 x i8 > %a to <vscale x 16 x i64 >
@@ -790,11 +816,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
790816; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
791817; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
792818; CHECK-NEWLOWERING-NEXT: ptrue p0.s
793- ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
794- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
795- ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
819+ ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
820+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
796821; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
797- ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s
822+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
823+ ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
798824; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
799825; CHECK-NEWLOWERING-NEXT: ret
800826entry:
@@ -824,11 +850,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
824850; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
825851; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
826852; CHECK-NEWLOWERING-NEXT: ptrue p0.d
827- ; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
828- ; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
829- ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
853+ ; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
854+ ; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
830855; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
831- ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z3.d
856+ ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
857+ ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
832858; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
833859; CHECK-NEWLOWERING-NEXT: ret
834860entry:
0 commit comments