From f9d4ac8dfd6beb92ba38cd6a0e18ec3266c2bd62 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 15 Jan 2025 12:01:06 +0000 Subject: [PATCH 1/7] Add uses of %scalable_arg in SME2 dot intrinsics tests --- .../AArch64/sme2-intrinsics-int-dots.ll | 392 +++++++++++------- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 358 ++++++++++------ 2 files changed, 480 insertions(+), 270 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 967d168593a40..272bf6f159866 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -1074,15 +1074,21 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1097,6 +1103,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1154,38 +1161,58 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: udot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1219,6 +1246,7 @@ entry: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1322,15 +1350,21 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1345,6 +1379,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1402,38 +1437,58 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: usdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte 
Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1467,6 +1522,7 @@ entry: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1572,15 +1628,21 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1595,6 +1657,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1652,38 +1715,58 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1717,6 +1800,7 @@ entry: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1822,15 +1906,21 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1845,6 +1935,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1902,38 +1993,58 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sudot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte 
Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1967,6 +2078,7 @@ entry: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index e7d1050b60799..def924b9631de 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -99,7 +99,7 @@ entry: ret void } -define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: svdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill @@ -107,16 +107,22 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -131,6 +137,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, 
poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -184,42 +191,62 @@ entry: ret void } -define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: svdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d ; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: 
ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -253,6 +280,7 @@ entry: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -322,7 +350,7 @@ entry: ret void } -define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: uvdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill @@ -330,16 +358,22 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: mov z2.d, z9.d +; CHECK-NEXT: mov z3.d, z10.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -354,6 +388,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -407,42 +442,62 @@ entry: ret void } -define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: uvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d ; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte 
Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -476,6 +531,7 @@ entry: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -544,42 +600,62 @@ entry: ret void } -define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: suvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; 
CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d +; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d ; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -613,6 +689,7 @@ entry: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -681,42 +758,62 @@ entry: ret void } -define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: usvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: addvl sp, sp, #-12 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: mov z16.d, z1.d +; CHECK-NEXT: mov z17.d, z2.d +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: mov z5.d, z6.d +; CHECK-NEXT: mov z6.d, z7.d +; CHECK-NEXT: mov z7.d, z20.d +; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z9.d +; CHECK-NEXT: mov z5.d, z10.d +; CHECK-NEXT: mov z6.d, z11.d +; CHECK-NEXT: mov z7.d, z24.d +; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] +; CHECK-NEXT: mov z4.d, z13.d +; CHECK-NEXT: mov z5.d, z14.d +; CHECK-NEXT: mov z6.d, z15.d +; CHECK-NEXT: mov z7.d, z28.d ; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: addvl sp, sp, #12 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -750,6 +847,7 @@ entry: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } From 68e65dbe6ecef86e2c8fcb066624638b35dd75bd Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 15 Jan 2025 13:31:22 +0000 Subject: [PATCH 2/7] [AArch64][SME] Make getRegAllocationHints stricter for multi-vector loads. getRegAllocationHints looks for ZPR2StridedOrContiguous load instructions which are used by FORM_TRANSPOSED_REG_TUPLE pseudos and adds all strided registers from this class to the list of hints. This patch changes getRegAllocationHints to restrict this list: - If the pseudo uses ZPRMul class, the first load must begin with a register which is a multiple of 2 or 4. - Only add a strided register to the list if it does not already have any live intervals. --- .../Target/AArch64/AArch64RegisterInfo.cpp | 79 +++- .../AArch64/sme2-intrinsics-int-dots.ll | 360 +++++++----------- .../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 324 ++++++---------- .../AArch64/sme2-multivec-regalloc.mir | 183 +++++++++ 4 files changed, 511 insertions(+), 435 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5973b63b5a802..aac1dc9cb5c06 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AArch64InstPrinter.h" #include "llvm/ADT/BitVector.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -1107,23 +1108,83 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. + const MachineInstr *TupleInst = nullptr; unsigned RegID = MRI.getRegClass(VirtReg)->getID(); // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. 
   if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
        RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
-      any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
-        return Use.getOpcode() ==
-                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
-               Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+      any_of(MRI.use_nodbg_instructions(VirtReg), [&TupleInst](
+                                                      const MachineInstr &Use) {
+        bool IsTuple =
+            Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+            Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+        TupleInst = &Use;
+        return IsTuple;
       })) {
-    const TargetRegisterClass *StridedRC =
-        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
-            ? &AArch64::ZPR2StridedRegClass
-            : &AArch64::ZPR4StridedRegClass;
+    unsigned LdOps = TupleInst->getNumOperands() - 1;
+    const TargetRegisterClass *StridedRC = LdOps == 2
+                                               ? &AArch64::ZPR2StridedRegClass
+                                               : &AArch64::ZPR4StridedRegClass;
+    SmallVector<MCPhysReg> StridedOrder;
     for (MCPhysReg Reg : Order)
       if (StridedRC->contains(Reg))
-        Hints.push_back(Reg);
+        StridedOrder.push_back(Reg);
+
+    int OpIdx = TupleInst->findRegisterUseOperandIdx(VirtReg, this);
+    if (OpIdx == -1)
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+
+    unsigned TupleID =
+        MRI.getRegClass(TupleInst->getOperand(0).getReg())->getID();
+    bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID ||
+                    TupleID == AArch64::ZPR4Mul4RegClassID;
+
+    if (OpIdx == 1) {
+      for (unsigned I = 0; I < StridedOrder.size(); ++I) {
+        MCPhysReg Reg = StridedOrder[I];
+        unsigned FirstReg = getSubReg(Reg, AArch64::zsub0);
+
+        // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting
+        // register of the first load should be a multiple of 2 or 4.
+        if (IsMulZPR &&
+            (getSubReg(Reg, AArch64::zsub0) - AArch64::Z0) % LdOps != 0)
+          continue;
+        // Skip this register if it has any live intervals assigned.
+        if (Matrix->isPhysRegUsed(Reg))
+          continue;
+
+        bool CanAssign = true;
+        for (unsigned Next = 1; Next < LdOps; ++Next) {
+          // Ensure we can assign enough registers from the list for all loads.
+          if (I + Next >= StridedOrder.size()) {
+            CanAssign = false;
+            break;
+          }
+          // Ensure the subsequent registers are not live and that the starting
+          // sub-registers are sequential.
+          MCPhysReg NextReg = StridedOrder[I + Next];
+          if (Matrix->isPhysRegUsed(NextReg) ||
+              (getSubReg(NextReg, AArch64::zsub0) != FirstReg + Next)) {
+            CanAssign = false;
+            break;
+          }
+        }
+        if (CanAssign)
+          Hints.push_back(Reg);
+      }
+    } else if (VRM->hasPhys(TupleInst->getOperand(1).getReg())) {
+      // This is not the first load in the sequence. Find the register
+      // assigned to the first and match to a strided reg in the list.
+ MCPhysReg FirstLoadPhysReg = + VRM->getPhys(TupleInst->getOperand(1).getReg()); + for (unsigned I = 0; I < StridedOrder.size(); ++I) { + if (StridedOrder[I] == FirstLoadPhysReg && + (I + (OpIdx - 1) < StridedOrder.size())) + Hints.push_back(StridedOrder[I + (OpIdx - 1)]); + } + } return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 272bf6f159866..db177fac3b265 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -1074,19 +1074,15 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -1161,58 +1157,40 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: udot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1350,19 +1328,15 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] ; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -1437,58 +1411,40 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: usdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1628,19 +1584,15 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] ; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -1715,58 +1667,40 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -1906,19 +1840,15 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] ; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -1993,58 +1923,40 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: sudot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index def924b9631de..63851dd857f97 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -107,20 +107,16 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] ; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -195,58 +191,40 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: svdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -358,20 +336,16 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x9] -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] ; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] -; CHECK-NEXT: mov z2.d, z9.d -; CHECK-NEXT: mov z3.d, z10.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] -; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 @@ -446,58 +420,40 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-LABEL: uvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-12 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: addvl sp, sp, #-9 ; CHECK-NEXT: lsl x9, x1, #1 -; CHECK-NEXT: str z20, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z11, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z10, [sp, #10, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: str z9, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10] -; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: mov z16.d, z1.d -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: mov z18.d, z3.d -; CHECK-NEXT: mov z4.d, z5.d -; CHECK-NEXT: mov z5.d, z6.d -; CHECK-NEXT: mov z6.d, z7.d -; CHECK-NEXT: mov z7.d, z20.d -; CHECK-NEXT: mov z19.d, z16.d +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] ; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z9.d -; CHECK-NEXT: mov z5.d, z10.d -; CHECK-NEXT: mov z6.d, z11.d -; CHECK-NEXT: mov z7.d, z24.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: mov z4.d, z13.d -; CHECK-NEXT: mov z5.d, z14.d -; CHECK-NEXT: mov z6.d, z15.d -; CHECK-NEXT: mov z7.d, z28.d -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; 
CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x0] -; CHECK-NEXT: addvl sp, sp, #12 +; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -604,58 +560,40 @@ define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { + entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %12 = extractvalue { , , , } %11, 0 + %13 = extractvalue { , , , } %11, 1 + %14 = extractvalue { , , , } %11, 2 + %15 = extractvalue { , , , } %11, 3 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %17 = extractvalue { , , , } %16, 0 + %18 = extractvalue { , , , } %16, 1 + %19 = extractvalue { , , , } %16, 2 + %20 = extractvalue { , , , } %16, 3 + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) + tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr, align 16 + ret void + } + + ; Function Attrs: nocallback nofree nosync nounwind willreturn + declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, , , , , , i32 immarg) #1 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) + declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) #2 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() #3 + + attributes #0 = { nounwind "target-features"="+sme2" } + attributes #1 = { nocallback nofree nosync nounwind willreturn } + attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } + attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } + 
+... +--- +name: form_4x_tuple_many_live +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } + - { reg: '$z0', virtual-reg: '%2' } + - { reg: '$z17', virtual-reg: '%3' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x0, $x1, $z0, $z17 + + ; CHECK-LABEL: form_4x_tuple_many_live + ; CHECK: stp d11, d10, [sp, #-32]! + ; CHECK-NEXT: lsl x9, x1, #1 + ; CHECK-NEXT: stp d9, d8, [sp, #16] + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] + ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: add x10, x9, x1 + ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x9] + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x10] + ; CHECK-NEXT: mov z8.d, z16.d + ; CHECK-NEXT: mov z9.d, z18.d + ; CHECK-NEXT: mov z21.d, z22.d + ; CHECK-NEXT: mov z10.d, z19.d + ; CHECK-NEXT: mov z22.d, z23.d + ; CHECK-NEXT: mov z25.d, z26.d + ; CHECK-NEXT: mov z11.d, z4.d + ; CHECK-NEXT: mov z23.d, z5.d + ; CHECK-NEXT: mov z26.d, z27.d + ; CHECK-NEXT: mov z27.d, z6.d + ; CHECK-NEXT: mov z29.d, z30.d + ; CHECK-NEXT: mov z30.d, z31.d + ; CHECK-NEXT: mov z31.d, z7.d + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] + ; CHECK-NEXT: ldp d9, d8, [sp, #16] + ; CHECK-NEXT: st1b { z0.b }, p0, [x0] + ; CHECK-NEXT: st1b { z17.b }, p0, [x0] + ; CHECK-NEXT: ldp d11, d10, [sp], #32 + ; CHECK-NEXT: ret + + %2:zpr = COPY $z0 + %3:zpr = COPY $z17 + %1:gpr64 = COPY $x1 + %0:gpr64common = COPY $x0 + %27:matrixindexgpr32_8_11 = COPY $wzr + %14:gpr64 = UBFMXri %1, 63, 62 + %pred:pnr_p8to15 = PTRUE_C_B implicit $vg + %4:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 + %20:gpr64 = ADDXrr %14, %1 + %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 + %15:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %14 + %21:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %20 + %26:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub0, %9.zsub0, %15.zsub0, %21.zsub0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %26, undef %28:zpr_4b, 0 + %29:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub1, %9.zsub1, %15.zsub1, %21.zsub1 + $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %29, undef %30:zpr_4b, 0 + %31:zpr4mul4 = 
FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub2, %9.zsub2, %15.zsub2, %21.zsub2 + %35:ppr_3b = PTRUE_B 31, implicit $vg + $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %31, undef %32:zpr_4b, 0 + %33:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub3, %9.zsub3, %15.zsub3, %21.zsub3 + $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %33, undef %34:zpr_4b, 0 + ST1B_IMM %2, %35, %0, 0 :: (store () into %ir.ptr) + ST1B_IMM %3, %35, %0, 0 :: (store () into %ir.ptr) + RET_ReallyLR + +... From 7635e0e987dbc4b6b06c4781e4d0039d794705e6 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Fri, 17 Jan 2025 10:19:39 +0000 Subject: [PATCH 3/7] - Instead of finding a hint based on the operand index from the FORM_TRANSPOSED pseudo, find hints based on whether any operands have been allocated yet. - Remove any_of which finds FORM_TRANSPOSED uses of VirtReg & instead iterate through all uses. - Clean up sme2-multivec-regalloc.mir & add new test which changes the allocation order of the load instructions. --- .../Target/AArch64/AArch64RegisterInfo.cpp | 103 +++---- .../AArch64/sme2-multivec-regalloc.mir | 266 ++++++++---------- 2 files changed, 172 insertions(+), 197 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index aac1dc9cb5c06..44eedddc1d880 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1108,20 +1108,16 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. - const MachineInstr *TupleInst = nullptr; unsigned RegID = MRI.getRegClass(VirtReg)->getID(); // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. - if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID || - RegID == AArch64::ZPR4StridedOrContiguousRegClassID) && - any_of(MRI.use_nodbg_instructions(VirtReg), [&TupleInst]( - const MachineInstr &Use) { - bool IsTuple = - Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || - Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO; - TupleInst = &Use; - return IsTuple; - })) { - unsigned LdOps = TupleInst->getNumOperands() - 1; + for (const MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { + if ((RegID != AArch64::ZPR2StridedOrContiguousRegClassID && + RegID != AArch64::ZPR4StridedOrContiguousRegClassID) || + (Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && + Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)) + continue; + + unsigned LdOps = Use.getNumOperands() - 1; const TargetRegisterClass *StridedRC = LdOps == 2 ? 
&AArch64::ZPR2StridedRegClass : &AArch64::ZPR4StridedRegClass; @@ -1131,63 +1127,76 @@ bool AArch64RegisterInfo::getRegAllocationHints( if (StridedRC->contains(Reg)) StridedOrder.push_back(Reg); - int OpIdx = TupleInst->findRegisterUseOperandIdx(VirtReg, this); - if (OpIdx == -1) - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); + auto GetRegStartingAt = [&](MCPhysReg FirstReg) -> MCPhysReg { + for (MCPhysReg Strided : StridedOrder) + if (getSubReg(Strided, AArch64::zsub0) == FirstReg) + return Strided; + return (MCPhysReg)AArch64::NoRegister; + }; + + int OpIdx = Use.findRegisterUseOperandIdx(VirtReg, this); + assert(OpIdx != -1 && "Expected operand index from register use."); - unsigned TupleID = - MRI.getRegClass(TupleInst->getOperand(0).getReg())->getID(); + unsigned TupleID = MRI.getRegClass(Use.getOperand(0).getReg())->getID(); bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID || TupleID == AArch64::ZPR4Mul4RegClassID; - if (OpIdx == 1) { + unsigned AssignedOp = 0; + if (!any_of(make_range(Use.operands_begin() + 1, Use.operands_end()), + [&](const MachineOperand &Op) { + if (!VRM->hasPhys(Op.getReg())) + return false; + AssignedOp = Op.getOperandNo(); + return true; + })) { + // There are no registers already assigned to any of the pseudo operands. + // Look for a valid starting register for the group. for (unsigned I = 0; I < StridedOrder.size(); ++I) { MCPhysReg Reg = StridedOrder[I]; unsigned FirstReg = getSubReg(Reg, AArch64::zsub0); // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting // register of the first load should be a multiple of 2 or 4. - if (IsMulZPR && - (getSubReg(Reg, AArch64::zsub0) - AArch64::Z0) % LdOps != 0) + if (IsMulZPR && (FirstReg - AArch64::Z0) % LdOps != 0) continue; // Skip this register if it has any live intervals assigned. if (Matrix->isPhysRegUsed(Reg)) continue; - bool CanAssign = true; + // Look for registers in StridedOrder which start with sub-registers + // following sequentially from FirstReg. If all are found and none are + // already live, add Reg to Hints. + MCPhysReg RegToAssign = Reg; for (unsigned Next = 1; Next < LdOps; ++Next) { - // Ensure we can assign enough registers from the list for all loads. - if (I + Next >= StridedOrder.size()) { - CanAssign = false; - break; - } - // Ensure the subsequent registers are not live and that the starting - // sub-registers are sequential. - MCPhysReg NextReg = StridedOrder[I + Next]; - if (Matrix->isPhysRegUsed(NextReg) || - (getSubReg(NextReg, AArch64::zsub0) != FirstReg + Next)) { - CanAssign = false; + MCPhysReg Strided = GetRegStartingAt(FirstReg + Next); + if (Strided == AArch64::NoRegister || + Matrix->isPhysRegUsed(Strided)) { + RegToAssign = AArch64::NoRegister; break; } + if (Next == (unsigned)OpIdx - 1) + RegToAssign = Strided; } - if (CanAssign) - Hints.push_back(Reg); - } - } else if (VRM->hasPhys(TupleInst->getOperand(1).getReg())) { - // This is not the first load in the sequence. Find the register - // assigned to the first and match to a strided reg in the list. - MCPhysReg FirstLoadPhysReg = - VRM->getPhys(TupleInst->getOperand(1).getReg()); - for (unsigned I = 0; I < StridedOrder.size(); ++I) { - if (StridedOrder[I] == FirstLoadPhysReg && - (I + (OpIdx - 1) < StridedOrder.size())) - Hints.push_back(StridedOrder[I + (OpIdx - 1)]); + if (RegToAssign != AArch64::NoRegister) + Hints.push_back(RegToAssign); } + } else { + // At least one operand already has a physical register assigned. 
+ // Find the starting sub-register of this and use it to work out the + // correct strided register to suggest based on the current op index. + MCPhysReg TargetStartReg = + getSubReg(VRM->getPhys(Use.getOperand(AssignedOp).getReg()), + AArch64::zsub0) + + (OpIdx - AssignedOp); + + for (unsigned I = 0; I < StridedOrder.size(); ++I) + if (getSubReg(StridedOrder[I], AArch64::zsub0) == TargetStartReg) + Hints.push_back(StridedOrder[I]); } - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, - VRM); + if (!Hints.empty()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { diff --git a/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir b/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir index b5abc5d4ee67c..a4bbee41cfff8 100644 --- a/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir +++ b/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir @@ -1,183 +1,149 @@ -# RUN: llc -force-streaming -verify-machineinstrs -enable-subreg-liveness -start-before=greedy %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs -enable-subreg-liveness -start-before=greedy %s -o - | FileCheck %s # No available group of four strided x4 registers, fall back on default allocation order ---- | - ; ModuleID = '' - source_filename = "" - target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" - target triple = "aarch64-linux-gnu" - - ; Function Attrs: nounwind - define void @form_4x_tuple_many_live(ptr %ptr, i64 %stride, %scalable_arg) #0 { - entry: - %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() - %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) - %2 = extractvalue { , , , } %1, 0 - %3 = extractvalue { , , , } %1, 1 - %4 = extractvalue { , , , } %1, 2 - %5 = extractvalue { , , , } %1, 3 - %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride - %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) - %7 = extractvalue { , , , } %6, 0 - %8 = extractvalue { , , , } %6, 1 - %9 = extractvalue { , , , } %6, 2 - %10 = extractvalue { , , , } %6, 3 - %mul3 = shl i64 %stride, 1 - %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 - %11 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) - %12 = extractvalue { , , , } %11, 0 - %13 = extractvalue { , , , } %11, 1 - %14 = extractvalue { , , , } %11, 2 - %15 = extractvalue { , , , } %11, 3 - %mul5 = mul i64 %stride, 3 - %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 - %16 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) - %17 = extractvalue { , , , } %16, 0 - %18 = extractvalue { , , , } %16, 1 - %19 = extractvalue { , , , } %16, 2 - %20 = extractvalue { , , , } %16, 3 - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %2, %7, %12, %17, poison, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) - tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) - store %scalable_arg, ptr %ptr, align 16 - ret void - } - - ; Function Attrs: nocallback nofree nosync 
nounwind willreturn - declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, , , , , , i32 immarg) #1 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) - declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) #2 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() #3 - - attributes #0 = { nounwind "target-features"="+sme2" } - attributes #1 = { nocallback nofree nosync nounwind willreturn } - attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } - attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) } - -... --- name: form_4x_tuple_many_live -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false tracksRegLiveness: true -hasWinCFI: false -noPhis: true -isSSA: false -noVRegs: false -hasFakeUses: false -callsEHReturn: false -callsUnwindInit: false -hasEHCatchret: false -hasEHScopes: false -hasEHFunclets: false -isOutlined: false -debugInstrRef: false -failsVerification: false -tracksDebugUserValues: false liveins: - { reg: '$x0', virtual-reg: '%0' } - { reg: '$x1', virtual-reg: '%1' } - { reg: '$z0', virtual-reg: '%2' } - { reg: '$z17', virtual-reg: '%3' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 0 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - isCalleeSavedInfoValid: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: [] -stack: [] -entry_values: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: {} +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } body: | bb.0.entry: liveins: $x0, $x1, $z0, $z17 ; CHECK-LABEL: form_4x_tuple_many_live - ; CHECK: stp d11, d10, [sp, #-32]! - ; CHECK-NEXT: lsl x9, x1, #1 - ; CHECK-NEXT: stp d9, d8, [sp, #16] - ; CHECK-NEXT: ptrue pn8.b + ; CHECK: stp d11, d10, [sp, #-48]! 
+ ; CHECK-NEXT: stp d9, d8, [sp, #16] + ; CHECK-NEXT: str x29, [sp, #32] + ; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG + ; CHECK-NEXT: .cfi_offset w29, -16 + ; CHECK-NEXT: .cfi_offset b8, -24 + ; CHECK-NEXT: .cfi_offset b9, -32 + ; CHECK-NEXT: .cfi_offset b10, -40 + ; CHECK-NEXT: .cfi_offset b11, -48 + ; CHECK-NEXT: lsl x9, x1, #1 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x1] - ; CHECK-NEXT: mov w8, wzr - ; CHECK-NEXT: add x10, x9, x1 + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: add x10, x9, x1 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x9] - ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x10] - ; CHECK-NEXT: mov z8.d, z16.d - ; CHECK-NEXT: mov z9.d, z18.d - ; CHECK-NEXT: mov z21.d, z22.d - ; CHECK-NEXT: mov z10.d, z19.d - ; CHECK-NEXT: mov z22.d, z23.d - ; CHECK-NEXT: mov z25.d, z26.d - ; CHECK-NEXT: mov z11.d, z4.d - ; CHECK-NEXT: mov z23.d, z5.d - ; CHECK-NEXT: mov z26.d, z27.d - ; CHECK-NEXT: mov z27.d, z6.d - ; CHECK-NEXT: mov z29.d, z30.d - ; CHECK-NEXT: mov z30.d, z31.d - ; CHECK-NEXT: mov z31.d, z7.d + ; CHECK-NEXT: mov z8.d, z16.d + ; CHECK-NEXT: mov z9.d, z18.d + ; CHECK-NEXT: mov z21.d, z22.d + ; CHECK-NEXT: mov z10.d, z19.d + ; CHECK-NEXT: mov z22.d, z23.d + ; CHECK-NEXT: mov z25.d, z26.d + ; CHECK-NEXT: mov z11.d, z4.d + ; CHECK-NEXT: mov z23.d, z5.d + ; CHECK-NEXT: mov z26.d, z27.d + ; CHECK-NEXT: mov z27.d, z6.d + ; CHECK-NEXT: mov z29.d, z30.d + ; CHECK-NEXT: mov z30.d, z31.d + ; CHECK-NEXT: mov z31.d, z7.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] - ; CHECK-NEXT: ldp d9, d8, [sp, #16] ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: st1b { z17.b }, p0, [x0] - ; CHECK-NEXT: ldp d11, d10, [sp], #32 + ; CHECK-NEXT: addvl sp, sp, #2 + ; CHECK-NEXT: ldp d9, d8, [sp, #16] + ; CHECK-NEXT: ldr x29, [sp, #32] + ; CHECK-NEXT: ldp d11, d10, [sp], #48 ; CHECK-NEXT: ret + %0:gpr64common = COPY $x0 + %1:gpr64 = COPY $x1 %2:zpr = COPY $z0 %3:zpr = COPY $z17 - %1:gpr64 = COPY $x1 - %0:gpr64common = COPY $x0 - %27:matrixindexgpr32_8_11 = COPY $wzr - %14:gpr64 = UBFMXri %1, 63, 62 + %5:matrixindexgpr32_8_11 = COPY $wzr + %6:gpr64 = UBFMXri %1, 63, 62 %pred:pnr_p8to15 = PTRUE_C_B implicit $vg - %4:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 - %20:gpr64 = ADDXrr %14, %1 - %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 - %15:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %14 - %21:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %20 - %26:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub0, %9.zsub0, %15.zsub0, %21.zsub0 - $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %26, undef %28:zpr_4b, 0 - %29:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub1, %9.zsub1, %15.zsub1, %21.zsub1 - $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %29, undef %30:zpr_4b, 0 - %31:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub2, %9.zsub2, %15.zsub2, %21.zsub2 - %35:ppr_3b = PTRUE_B 31, implicit $vg - $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %31, undef %32:zpr_4b, 0 - %33:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub3, %9.zsub3, %15.zsub3, %21.zsub3 - $za = 
UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %33, undef %34:zpr_4b, 0 - ST1B_IMM %2, %35, %0, 0 :: (store () into %ir.ptr) - ST1B_IMM %3, %35, %0, 0 :: (store () into %ir.ptr) + %7:ppr_3b = PTRUE_B 31, implicit $vg + %8:gpr64 = ADDXrr %6, %1 + %9:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 + %10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 + %11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6 + %12:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8 + %13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub0, %10.zsub0, %11.zsub0, %12.zsub0 + %14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub1, %10.zsub1, %11.zsub1, %12.zsub1 + %15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub2, %10.zsub2, %11.zsub2, %12.zsub2 + %16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub3, %10.zsub3, %11.zsub3, %12.zsub3 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0 + ST1B_IMM %2, %7, %0, 0 :: (store () into %stack.0) + ST1B_IMM %3, %7, %0, 0 :: (store () into %stack.0) RET_ReallyLR +... +--- +name: form_4x_tuple_allocation_order +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } + - { reg: '$z0', virtual-reg: '%2' } +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + liveins: $x0, $x1, $z0 + + ; CHECK: str x29, [sp, #-16]! + ; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG + ; CHECK-NEXT: .cfi_offset w29, -16 + ; CHECK-NEXT: lsl x9, x1, #1 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: add x10, x9, x1 + ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] + ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] + ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] + ; CHECK-NEXT: st1b { z0.b }, p0, [x0] + ; CHECK-NEXT: addvl sp, sp, #2 + ; CHECK-NEXT: ldr x29, [sp], #16 + ; CHECK-NEXT: ret + %0:gpr64common = COPY $x0 + %1:gpr64 = COPY $x1 + %2:zpr = COPY $z0 + %5:matrixindexgpr32_8_11 = COPY $wzr + %6:gpr64 = UBFMXri %1, 63, 62 + %pred:pnr_p8to15 = PTRUE_C_B implicit $vg + %7:ppr_3b = PTRUE_B 31, implicit $vg + %8:gpr64 = ADDXrr %6, %1 + %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8 + %10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6 + %11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 + %12:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 + %13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub0, %11.zsub0, %10.zsub0, %9.zsub0 + %14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub1, %11.zsub1, %10.zsub1, %9.zsub1 + %15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub2, %11.zsub2, %10.zsub2, 
%9.zsub2 + %16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub3, %11.zsub3, %10.zsub3, %9.zsub3 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0 + ST1B_IMM %2, %7, %0, 0 :: (store () into %stack.0) + RET_ReallyLR ... From 81c3d47296564e81690caa498658557a84d18a43 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Mon, 27 Jan 2025 15:16:35 +0000 Subject: [PATCH 4/7] - Rewrote search for operand which already has a register assigned - Add early exit if the function does not have SME or is not in streaming mode - Move loop over uses of VirtReg into if block checking the RegID - Remove GetRegStartingAt --- .../Target/AArch64/AArch64RegisterInfo.cpp | 161 ++++++------ .../AArch64/sme2-intrinsics-int-dots.ll | 240 ++++++++---------- .../AArch64/sme2-multivec-regalloc.mir | 53 +++- 3 files changed, 225 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 44eedddc1d880..7ec3dec5d1b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1100,6 +1100,11 @@ bool AArch64RegisterInfo::getRegAllocationHints( const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &ST = MF.getSubtarget(); + if (!ST.hasSME() || !ST.isStreaming()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, + VRM); + // The SVE calling convention preserves registers Z8-Z23. As a result, there // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the // callee-saved registers and so by default these will be pushed to the back @@ -1109,94 +1114,82 @@ bool AArch64RegisterInfo::getRegAllocationHints( // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. unsigned RegID = MRI.getRegClass(VirtReg)->getID(); - // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. - for (const MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { - if ((RegID != AArch64::ZPR2StridedOrContiguousRegClassID && - RegID != AArch64::ZPR4StridedOrContiguousRegClassID) || - (Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && - Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)) - continue; - - unsigned LdOps = Use.getNumOperands() - 1; - const TargetRegisterClass *StridedRC = LdOps == 2 - ? 
&AArch64::ZPR2StridedRegClass - : &AArch64::ZPR4StridedRegClass; - - SmallVector StridedOrder; - for (MCPhysReg Reg : Order) - if (StridedRC->contains(Reg)) - StridedOrder.push_back(Reg); - - auto GetRegStartingAt = [&](MCPhysReg FirstReg) -> MCPhysReg { - for (MCPhysReg Strided : StridedOrder) - if (getSubReg(Strided, AArch64::zsub0) == FirstReg) - return Strided; - return (MCPhysReg)AArch64::NoRegister; - }; - - int OpIdx = Use.findRegisterUseOperandIdx(VirtReg, this); - assert(OpIdx != -1 && "Expected operand index from register use."); - - unsigned TupleID = MRI.getRegClass(Use.getOperand(0).getReg())->getID(); - bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID || - TupleID == AArch64::ZPR4Mul4RegClassID; - - unsigned AssignedOp = 0; - if (!any_of(make_range(Use.operands_begin() + 1, Use.operands_end()), - [&](const MachineOperand &Op) { - if (!VRM->hasPhys(Op.getReg())) - return false; - AssignedOp = Op.getOperandNo(); - return true; - })) { - // There are no registers already assigned to any of the pseudo operands. - // Look for a valid starting register for the group. - for (unsigned I = 0; I < StridedOrder.size(); ++I) { - MCPhysReg Reg = StridedOrder[I]; - unsigned FirstReg = getSubReg(Reg, AArch64::zsub0); - - // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting - // register of the first load should be a multiple of 2 or 4. - if (IsMulZPR && (FirstReg - AArch64::Z0) % LdOps != 0) - continue; - // Skip this register if it has any live intervals assigned. - if (Matrix->isPhysRegUsed(Reg)) - continue; - - // Look for registers in StridedOrder which start with sub-registers - // following sequentially from FirstReg. If all are found and none are - // already live, add Reg to Hints. - MCPhysReg RegToAssign = Reg; - for (unsigned Next = 1; Next < LdOps; ++Next) { - MCPhysReg Strided = GetRegStartingAt(FirstReg + Next); - if (Strided == AArch64::NoRegister || - Matrix->isPhysRegUsed(Strided)) { - RegToAssign = AArch64::NoRegister; - break; + if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || + RegID == AArch64::ZPR4StridedOrContiguousRegClassID) { + + // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE. + for (const MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) { + if (Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO && + Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) + continue; + + unsigned LdOps = Use.getNumOperands() - 1; + const TargetRegisterClass *StridedRC = + LdOps == 2 ? &AArch64::ZPR2StridedRegClass + : &AArch64::ZPR4StridedRegClass; + + SmallVector StridedOrder; + for (MCPhysReg Reg : Order) + if (StridedRC->contains(Reg)) + StridedOrder.push_back(Reg); + + int OpIdx = Use.findRegisterUseOperandIdx(VirtReg, this); + assert(OpIdx != -1 && "Expected operand index from register use."); + + unsigned TupleID = MRI.getRegClass(Use.getOperand(0).getReg())->getID(); + bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID || + TupleID == AArch64::ZPR4Mul4RegClassID; + + const MachineOperand *AssignedRegOp = llvm::find_if( + make_range(Use.operands_begin() + 1, Use.operands_end()), + [&VRM](const MachineOperand &Op) { + return VRM->hasPhys(Op.getReg()); + }); + + if (AssignedRegOp == Use.operands_end()) { + // There are no registers already assigned to any of the pseudo + // operands. Look for a valid starting register for the group. 
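// Note on the loop that follows: Reg is the candidate register for VirtReg
// itself. VirtReg appears as operand OpIdx of the pseudo (operand 0 is the
// tuple def), so the candidate group is anchored at
// FirstStridedReg = Reg - OpIdx + 1, which places VirtReg in slot OpIdx - 1
// of the group. The hint eventually pushed, FirstStridedReg + OpIdx - 1, is
// therefore Reg itself, but only once every member of the group has been
// found in StridedOrder with sequentially numbered zsub0 sub-registers and
// none of them is marked used in the LiveRegMatrix.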
+ for (unsigned I = 0; I < StridedOrder.size(); ++I) { + MCPhysReg Reg = StridedOrder[I]; + SmallVector Regs; + unsigned FirstStridedReg = Reg - OpIdx + 1; + + // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting + // register of the first load should be a multiple of 2 or 4. + unsigned FirstSubReg = getSubReg(FirstStridedReg, AArch64::zsub0); + if (IsMulZPR && (FirstSubReg - AArch64::Z0) % LdOps != 0) + continue; + + for (unsigned Op = 0; Op < LdOps; ++Op) { + if (!is_contained(StridedOrder, FirstStridedReg + Op) || + getSubReg(FirstStridedReg + Op, AArch64::zsub0) != + FirstSubReg + Op) + break; + Regs.push_back(FirstStridedReg + Op); } - if (Next == (unsigned)OpIdx - 1) - RegToAssign = Strided; + + if (Regs.size() == LdOps && all_of(Regs, [&](MCPhysReg R) { + return !Matrix->isPhysRegUsed(R); + })) + Hints.push_back(FirstStridedReg + OpIdx - 1); } - if (RegToAssign != AArch64::NoRegister) - Hints.push_back(RegToAssign); + } else { + // At least one operand already has a physical register assigned. + // Find the starting sub-register of this and use it to work out the + // correct strided register to suggest based on the current op index. + MCPhysReg TargetStartReg = + getSubReg(VRM->getPhys(AssignedRegOp->getReg()), AArch64::zsub0) + + (OpIdx - AssignedRegOp->getOperandNo()); + + for (unsigned I = 0; I < StridedOrder.size(); ++I) + if (getSubReg(StridedOrder[I], AArch64::zsub0) == TargetStartReg) + Hints.push_back(StridedOrder[I]); } - } else { - // At least one operand already has a physical register assigned. - // Find the starting sub-register of this and use it to work out the - // correct strided register to suggest based on the current op index. - MCPhysReg TargetStartReg = - getSubReg(VRM->getPhys(Use.getOperand(AssignedOp).getReg()), - AArch64::zsub0) + - (OpIdx - AssignedOp); - - for (unsigned I = 0; I < StridedOrder.size(); ++I) - if (getSubReg(StridedOrder[I], AArch64::zsub0) == TargetStartReg) - Hints.push_back(StridedOrder[I]); - } - if (!Hints.empty()) - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, - MF, VRM); + if (!Hints.empty()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + } } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index db177fac3b265..379e4116511e8 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -371,47 +371,39 @@ define void @udot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, ) into %stack.0) RET_ReallyLR ... + +# First multi-vector load to be allocated is not the first operand of the FORM_TRANSPOSED pseudo --- name: form_4x_tuple_allocation_order tracksRegLiveness: true -liveins: - - { reg: '$x0', virtual-reg: '%0' } - - { reg: '$x1', virtual-reg: '%1' } - - { reg: '$z0', virtual-reg: '%2' } stack: - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, @@ -147,3 +140,45 @@ body: | ST1B_IMM %2, %7, %0, 0 :: (store () into %stack.0) RET_ReallyLR ... 
+ +# Strided order is [ $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 ] +# Ensure we don't allocate $z23_z31 & $z0_z8 although they are consecutive +--- + name: udot_form_2x_tuple_live_reg_order + tracksRegLiveness: true + body: | + bb.0.entry: + liveins: $x0, $x1, $z16, $z17, $z18, $z19, $z20, $z21, $z22 + + ; CHECK: stp d9, d8, [sp, #-16]! + ; CHECK-NEXT: .cfi_def_cfa_offset 16 + ; CHECK-NEXT: .cfi_offset b8, -8 + ; CHECK-NEXT: .cfi_offset b9, -16 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] + ; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b + ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b + ; CHECK-NEXT: ldp d9, d8, [sp], #16 + ; CHECK-NEXT: ret + + %0:gpr64 = COPY $x1 + %1:gpr64common = COPY $x0 + %2:zpr = COPY $z16 + %3:zpr = COPY $z17 + %4:zpr = COPY $z18 + %5:zpr = COPY $z19 + %6:zpr = COPY $z20 + %7:zpr = COPY $z21 + %8:zpr = COPY $z22 + %9:matrixindexgpr32_8_11 = COPY $wzr + %10:pnr_p8to15 = PTRUE_C_B implicit $vg + %11:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO %10, %1, 0 + %12:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO %10, %1, %0 + %13:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub0, %12.zsub0 + %14:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub1, %12.zsub1 + $za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %13, undef %15:zpr_4b + $za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %14, undef %16:zpr_4b + RET_ReallyLR +... From 607e3df04896f90c2dcc58c838f8839dc4bb3086 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 29 Jan 2025 14:01:19 +0000 Subject: [PATCH 5/7] - Allow mix of x2 & x4 multivector loads and intrinsics --- .../Target/AArch64/AArch64ISelLowering.cpp | 19 ++-- .../Target/AArch64/AArch64RegisterInfo.cpp | 5 +- .../AArch64/sme2-intrinsics-int-dots.ll | 99 +++++++++++++++++++ 3 files changed, 109 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ede1fb93fe5f..4bdfff1f47ed0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8763,17 +8763,9 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { bool shouldUseFormStridedPseudo(MachineInstr &MI) { MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - const TargetRegisterClass *RegClass = nullptr; - switch (MI.getOpcode()) { - case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO: - RegClass = &AArch64::ZPR2StridedOrContiguousRegClass; - break; - case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO: - RegClass = &AArch64::ZPR4StridedOrContiguousRegClass; - break; - default: - llvm_unreachable("Unexpected opcode."); - } + assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO || + MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) && + "Unexpected opcode."); MCRegister SubReg = MCRegister::NoRegister; for (unsigned I = 1; I < MI.getNumOperands(); ++I) { @@ -8790,8 +8782,11 @@ bool shouldUseFormStridedPseudo(MachineInstr &MI) { SubReg = OpSubReg; MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg()); + const TargetRegisterClass *CopySrcClass = + MRI.getRegClass(CopySrcOp->getReg()); if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg || - MRI.getRegClass(CopySrcOp->getReg()) != RegClass) + (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass && + CopySrcClass != 
&AArch64::ZPR4StridedOrContiguousRegClass)) return false; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 7ec3dec5d1b1b..c5d9cd0920f30 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1125,8 +1125,9 @@ bool AArch64RegisterInfo::getRegAllocationHints( unsigned LdOps = Use.getNumOperands() - 1; const TargetRegisterClass *StridedRC = - LdOps == 2 ? &AArch64::ZPR2StridedRegClass - : &AArch64::ZPR4StridedRegClass; + RegID == AArch64::ZPR2StridedOrContiguousRegClassID + ? &AArch64::ZPR2StridedRegClass + : &AArch64::ZPR4StridedRegClass; SmallVector StridedOrder; for (MCPhysReg Reg : Order) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 379e4116511e8..d8d796e392b23 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -354,6 +354,53 @@ entry: ret void } +define void @udot_single_za32_u16_vg1x2_x4load_x2tuple(ptr %ptr, i64 %stride, %zn) #0 { +; CHECK-LABEL: udot_single_za32_u16_vg1x2_x4load_x2tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x9] +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z5.h, z6.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z13.h, z14.h }, z0.h +; CHECK-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %2, %7, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %3, %8, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %4, %9, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %5, %10, %zn) + ret void +} + define void 
@udot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: @@ -1196,6 +1243,58 @@ entry: ret void } +define void @udot_single_za32_u16_vg1x4_x2load_x4tuple(ptr %ptr, i64 %stride, %zn) #0 { +; CHECK-LABEL: udot_single_za32_u16_vg1x4_x2load_x4tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b, z12.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b +; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %7 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %8 = extractvalue { , } %7, 0 + %9 = extractvalue { , } %7, 1 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %10 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %11 = extractvalue { , } %10, 0 + %12 = extractvalue { , } %10, 1 + call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, %2, %5, %8, %11, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, %3, %6, %9, %12, %zn) + ret void +} + define void @udot_lane_za64_u16_vg1x2(i32 %slice, %unused, %zn0, %zn1, %zn2) #1 { ; CHECK-LABEL: udot_lane_za64_u16_vg1x2: ; CHECK: // %bb.0: From 65e937d68d12e0ac82a17932449614c766eaafd5 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 29 Jan 2025 17:24:52 +0000 Subject: [PATCH 6/7] - Renamed LdOps to UseOps - Rewrote the case where no registers are already assigned to avoid creating the Regs vector. 
- Added more comments with an example --- .../Target/AArch64/AArch64RegisterInfo.cpp | 82 ++++++++++++++----- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index c5d9cd0920f30..b0f9f4ecdf351 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1123,11 +1123,18 @@ bool AArch64RegisterInfo::getRegAllocationHints( Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) continue; - unsigned LdOps = Use.getNumOperands() - 1; - const TargetRegisterClass *StridedRC = - RegID == AArch64::ZPR2StridedOrContiguousRegClassID - ? &AArch64::ZPR2StridedRegClass - : &AArch64::ZPR4StridedRegClass; + unsigned UseOps = Use.getNumOperands() - 1; + const TargetRegisterClass *StridedRC; + switch (RegID) { + case AArch64::ZPR2StridedOrContiguousRegClassID: + StridedRC = &AArch64::ZPR2StridedRegClass; + break; + case AArch64::ZPR4StridedOrContiguousRegClassID: + StridedRC = &AArch64::ZPR4StridedRegClass; + break; + default: + llvm_unreachable("Unexpected RegID"); + } SmallVector StridedOrder; for (MCPhysReg Reg : Order) @@ -1147,32 +1154,67 @@ bool AArch64RegisterInfo::getRegAllocationHints( return VRM->hasPhys(Op.getReg()); }); + // Example: + // + // When trying to find a suitable register allocation for VirtReg %v2 in: + // + // %v0:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v1:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v2:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v3:zpr2stridedorcontiguous = ld1 p0/z, [...] + // %v4:zpr4mul4 = FORM_TRANSPOSED_X4 %v0:0, %v1:0, %v2:0, %v3:0 + // + // One such suitable allocation would be: + // + // { z0, z8 } = ld1 p0/z, [...] + // { z1, z9 } = ld1 p0/z, [...] + // { z2, z10 } = ld1 p0/z, [...] + // { z3, z11 } = ld1 p0/z, [...] + // { z0, z1, z2, z3 } = + // FORM_TRANSPOSED_X4 {z0, z8}:0, {z1, z9}:0, {z2, z10}:0, {z3, z11}:0 + // + // Below we distinguish two cases when trying to find a register: + // * None of the registers used by FORM_TRANSPOSED_X4 have been assigned + // yet. In this case the code must ensure that there are at least UseOps + // free consecutive registers. If IsMulZPR is true, then the first of + // the registers must also be a multiple of UseOps, e.g. { z0, z1, z2, z3 } + // is valid but { z1, z2, z3, z5 } is not. + // * One or more of the registers used by FORM_TRANSPOSED_X4 is already + // assigned a physical register, which means only checking that a + // consecutive range of free tuple registers exists which includes + // the assigned register. + // e.g. in the example above, if { z0, z8 } is already allocated for + // %v0, we just need to ensure that { z1, z9 }, { z2, z10 } and + // { z3, z11 } are also free. If so, we add { z2, z10 }. + if (AssignedRegOp == Use.operands_end()) { // There are no registers already assigned to any of the pseudo // operands. Look for a valid starting register for the group. for (unsigned I = 0; I < StridedOrder.size(); ++I) { MCPhysReg Reg = StridedOrder[I]; SmallVector Regs; - unsigned FirstStridedReg = Reg - OpIdx + 1; // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting // register of the first load should be a multiple of 2 or 4.
- unsigned FirstSubReg = getSubReg(FirstStridedReg, AArch64::zsub0); - if (IsMulZPR && (FirstSubReg - AArch64::Z0) % LdOps != 0) + unsigned SubRegIdx = Use.getOperand(OpIdx).getSubReg(); + if (IsMulZPR && (getSubReg(Reg, SubRegIdx) - AArch64::Z0) % UseOps != + ((unsigned)OpIdx - 1)) continue; - for (unsigned Op = 0; Op < LdOps; ++Op) { - if (!is_contained(StridedOrder, FirstStridedReg + Op) || - getSubReg(FirstStridedReg + Op, AArch64::zsub0) != - FirstSubReg + Op) - break; - Regs.push_back(FirstStridedReg + Op); - } - - if (Regs.size() == LdOps && all_of(Regs, [&](MCPhysReg R) { - return !Matrix->isPhysRegUsed(R); - })) - Hints.push_back(FirstStridedReg + OpIdx - 1); + // In the example above, if VirtReg is the third operand of the + // tuple (%v2) and Reg == Z2_Z10, then we need to make sure that + // Z0_Z8, Z1_Z9 and Z3_Z11 are also available. + auto IsFreeConsecutiveReg = [&](unsigned UseOp) { + unsigned R = Reg - (OpIdx - 1) + UseOp; + return StridedRC->contains(R) && + (UseOp == 0 || + ((getSubReg(R, AArch64::zsub0) - AArch64::Z0) == + (getSubReg(R - 1, AArch64::zsub0) - AArch64::Z0) + 1)) && + !Matrix->isPhysRegUsed(R); + }; + if (all_of(iota_range(0U, UseOps, /*Inclusive=*/false), + IsFreeConsecutiveReg)) + Hints.push_back(Reg); } } else { // At least one operand already has a physical register assigned. From a651b436d914ae36f141a6ed73261120a855f0d8 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 30 Jan 2025 14:19:22 +0000 Subject: [PATCH 7/7] - Moved MRI def after if condition for hasSME() --- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index b0f9f4ecdf351..49f6860346fa1 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1098,7 +1098,6 @@ bool AArch64RegisterInfo::getRegAllocationHints( Register VirtReg, ArrayRef Order, SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); auto &ST = MF.getSubtarget(); if (!ST.hasSME() || !ST.isStreaming()) @@ -1113,6 +1112,7 @@ bool AArch64RegisterInfo::getRegAllocationHints( // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy // instructions over reducing the number of clobbered callee-save registers, // so we add the strided registers as a hint. + const MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned RegID = MRI.getRegClass(VirtReg)->getID(); if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID || RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
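The hint logic above is easier to follow outside of the diff context, so the short standalone program below models just the selection rules it implements: walk the strided tuples in allocation order, accept a group only if UseOps numerically consecutive starting registers are all free, and, for the ZPRMul tuple classes, require the group to start at a multiple of UseOps. Everything else is invented for this sketch: the integer encoding of each tuple by its starting Z register, the names findFirstFreeGroup, Starts and Used, and the main() driver. It is a model of the idea, not the LLVM implementation.

#include <cstdio>
#include <optional>
#include <set>
#include <vector>

// Returns the starting Z-register number of the first free consecutive group
// of UseOps strided tuples, or std::nullopt if no such group exists.
std::optional<unsigned>
findFirstFreeGroup(const std::vector<unsigned> &Starts,
                   const std::set<unsigned> &Used, unsigned UseOps,
                   bool IsMulZPR) {
  std::set<unsigned> Avail(Starts.begin(), Starts.end());
  for (unsigned First : Starts) {
    // ZPRMul tuples must start at a multiple of UseOps (z0, z4, ... for x4).
    if (IsMulZPR && First % UseOps != 0)
      continue;
    bool AllFree = true;
    for (unsigned Op = 0; Op < UseOps; ++Op)
      AllFree &= Avail.count(First + Op) != 0 && Used.count(First + Op) == 0;
    if (AllFree)
      return First;
  }
  return std::nullopt;
}

int main() {
  // Starting registers in the x2 strided allocation order quoted in the MIR
  // test udot_form_2x_tuple_live_reg_order: z16..z23 first, then z0..z7.
  std::vector<unsigned> Starts = {16, 17, 18, 19, 20, 21, 22, 23,
                                  0,  1,  2,  3,  4,  5,  6,  7};
  // z16-z22 are live into the block, so their tuples are unavailable.
  std::set<unsigned> Used = {16, 17, 18, 19, 20, 21, 22};
  if (auto First = findFirstFreeGroup(Starts, Used, /*UseOps=*/2,
                                      /*IsMulZPR=*/false))
    // Prints z0: z23 is rejected because no x2 strided tuple starts at z24,
    // which mirrors the test's "don't allocate $z23_z31 & $z0_z8" comment.
    std::printf("hint the group starting at z%u\n", *First);
  return 0;
}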