diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4ede1fb93fe5f..4bdfff1f47ed0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8763,17 +8763,9 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
 bool shouldUseFormStridedPseudo(MachineInstr &MI) {
   MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
 
-  const TargetRegisterClass *RegClass = nullptr;
-  switch (MI.getOpcode()) {
-  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
-    RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
-    break;
-  case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
-    RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
-    break;
-  default:
-    llvm_unreachable("Unexpected opcode.");
-  }
+  assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+          MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
+         "Unexpected opcode.");
 
   MCRegister SubReg = MCRegister::NoRegister;
   for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
@@ -8790,8 +8782,11 @@ bool shouldUseFormStridedPseudo(MachineInstr &MI) {
       SubReg = OpSubReg;
 
     MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
+    const TargetRegisterClass *CopySrcClass =
+        CopySrcOp ? MRI.getRegClass(CopySrcOp->getReg()) : nullptr;
     if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
-        MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
+        (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
+         CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
       return false;
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 5973b63b5a802..49f6860346fa1 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -20,6 +20,7 @@
 #include "MCTargetDesc/AArch64InstPrinter.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -1097,7 +1098,11 @@ bool AArch64RegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (!ST.hasSME() || !ST.isStreaming())
+    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+                                                     VRM);
 
   // The SVE calling convention preserves registers Z8-Z23. As a result, there
   // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the
@@ -1107,26 +1112,127 @@ bool AArch64RegisterInfo::getRegAllocationHints(
   // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
   // instructions over reducing the number of clobbered callee-save registers,
   // so we add the strided registers as a hint.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   unsigned RegID = MRI.getRegClass(VirtReg)->getID();
-  // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE.
-  if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
-       RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
-      any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
-        return Use.getOpcode() ==
-                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
-               Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
-      })) {
-    const TargetRegisterClass *StridedRC =
-        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
-            ? &AArch64::ZPR2StridedRegClass
-            : &AArch64::ZPR4StridedRegClass;
-
-    for (MCPhysReg Reg : Order)
-      if (StridedRC->contains(Reg))
-        Hints.push_back(Reg);
+  if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+      RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
+
+    // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE.
+    for (const MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
+      if (Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
+          Use.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
+        continue;
+
+      unsigned UseOps = Use.getNumOperands() - 1;
+      const TargetRegisterClass *StridedRC;
+      switch (RegID) {
+      case AArch64::ZPR2StridedOrContiguousRegClassID:
+        StridedRC = &AArch64::ZPR2StridedRegClass;
+        break;
+      case AArch64::ZPR4StridedOrContiguousRegClassID:
+        StridedRC = &AArch64::ZPR4StridedRegClass;
+        break;
+      default:
+        llvm_unreachable("Unexpected RegID");
+      }
 
-    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
-                                                     VRM);
+      SmallVector<MCPhysReg> StridedOrder;
+      for (MCPhysReg Reg : Order)
+        if (StridedRC->contains(Reg))
+          StridedOrder.push_back(Reg);
+
+      int OpIdx = Use.findRegisterUseOperandIdx(VirtReg, this);
+      assert(OpIdx != -1 && "Expected operand index from register use.");
+
+      unsigned TupleID = MRI.getRegClass(Use.getOperand(0).getReg())->getID();
+      bool IsMulZPR = TupleID == AArch64::ZPR2Mul2RegClassID ||
+                      TupleID == AArch64::ZPR4Mul4RegClassID;
+
+      const MachineOperand *AssignedRegOp = llvm::find_if(
+          make_range(Use.operands_begin() + 1, Use.operands_end()),
+          [&VRM](const MachineOperand &Op) {
+            return VRM->hasPhys(Op.getReg());
+          });
+
+      // Example:
+      //
+      // When trying to find a suitable register allocation for VirtReg %v2 in:
+      //
+      // %v0:zpr2stridedorcontiguous = ld1 p0/z, [...]
+      // %v1:zpr2stridedorcontiguous = ld1 p0/z, [...]
+      // %v2:zpr2stridedorcontiguous = ld1 p0/z, [...]
+      // %v3:zpr2stridedorcontiguous = ld1 p0/z, [...]
+      // %v4:zpr4mul4 = FORM_TRANSPOSED_X4 %v0:0, %v1:0, %v2:0, %v3:0
+      //
+      // One such suitable allocation would be:
+      //
+      // { z0, z8 }  = ld1 p0/z, [...]
+      // { z1, z9 }  = ld1 p0/z, [...]
+      // { z2, z10 } = ld1 p0/z, [...]
+      // { z3, z11 } = ld1 p0/z, [...]
+      // { z0, z1, z2, z3 } =
+      //     FORM_TRANSPOSED_X4 {z0, z8}:0, {z1, z9}:0, {z2, z10}:0, {z3, z11}:0
+      //
+      // Below we distinguish two cases when trying to find a register:
+      // * None of the registers used by FORM_TRANSPOSED_X4 have been assigned
+      //   yet. In this case the code must ensure that there are at least
+      //   UseOps free consecutive registers. If IsMulZPR is true, then the
+      //   first of the registers must also be a multiple of UseOps, e.g.
+      //   { z0, z1, z2, z3 } is valid but { z1, z2, z3, z5 } is not.
+      // * One or more of the registers used by FORM_TRANSPOSED_X4 is already
+      //   assigned a physical register, which means we only need to check that
+      //   a consecutive range of free tuple registers exists which includes
+      //   the assigned register.
+      //   e.g.
in the example above, if { z0, z8 } is already allocated for + // %v0, we just need to ensure that { z1, z9 }, { z2, z10 } and + // { z3, z11 } are also free. If so, we add { z2, z10 }. + + if (AssignedRegOp == Use.operands_end()) { + // There are no registers already assigned to any of the pseudo + // operands. Look for a valid starting register for the group. + for (unsigned I = 0; I < StridedOrder.size(); ++I) { + MCPhysReg Reg = StridedOrder[I]; + SmallVector Regs; + + // If the FORM_TRANSPOSE nodes use the ZPRMul classes, the starting + // register of the first load should be a multiple of 2 or 4. + unsigned SubRegIdx = Use.getOperand(OpIdx).getSubReg(); + if (IsMulZPR && (getSubReg(Reg, SubRegIdx) - AArch64::Z0) % UseOps != + ((unsigned)OpIdx - 1)) + continue; + + // In the example above, if VirtReg is the third operand of the + // tuple (%v2) and Reg == Z2_Z10, then we need to make sure that + // Z0_Z8, Z1_Z9 and Z3_Z11 are also available. + auto IsFreeConsecutiveReg = [&](unsigned UseOp) { + unsigned R = Reg - (OpIdx - 1) + UseOp; + return StridedRC->contains(R) && + (UseOp == 0 || + ((getSubReg(R, AArch64::zsub0) - AArch64::Z0) == + (getSubReg(R - 1, AArch64::zsub0) - AArch64::Z0) + 1)) && + !Matrix->isPhysRegUsed(R); + }; + if (all_of(iota_range(0U, UseOps, /*Inclusive=*/false), + IsFreeConsecutiveReg)) + Hints.push_back(Reg); + } + } else { + // At least one operand already has a physical register assigned. + // Find the starting sub-register of this and use it to work out the + // correct strided register to suggest based on the current op index. + MCPhysReg TargetStartReg = + getSubReg(VRM->getPhys(AssignedRegOp->getReg()), AArch64::zsub0) + + (OpIdx - AssignedRegOp->getOperandNo()); + + for (unsigned I = 0; I < StridedOrder.size(); ++I) + if (getSubReg(StridedOrder[I], AArch64::zsub0) == TargetStartReg) + Hints.push_back(StridedOrder[I]); + } + + if (!Hints.empty()) + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, + MF, VRM); + } } for (MachineInstr &MI : MRI.def_instructions(VirtReg)) { diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 967d168593a40..d8d796e392b23 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -354,6 +354,53 @@ entry: ret void } +define void @udot_single_za32_u16_vg1x2_x4load_x2tuple(ptr %ptr, i64 %stride, %zn) #0 { +; CHECK-LABEL: udot_single_za32_u16_vg1x2_x4load_x2tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: add x9, x0, x1 +; CHECK-NEXT: str z14, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str z13, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x9] +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z5.h, z6.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z13.h, z14.h }, z0.h +; CHECK-NEXT: ldr z14, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , , , } %1, 0 + %3 = extractvalue { , , , } %1, 1 + %4 = extractvalue { , , , } %1, 2 + %5 = extractvalue { , , , } %1, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %6 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2) + %7 = extractvalue { , , , } %6, 0 + %8 = extractvalue { , , , } %6, 1 + %9 = extractvalue { , , , } %6, 2 + %10 = extractvalue { , , , } %6, 3 + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %2, %7, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %3, %8, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %4, %9, %zn) + call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, %5, %10, %zn) + ret void +} + define void @udot_single_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, %zn4) #0 { ; CHECK-LABEL: udot_single_za32_u16_vg1x4: ; CHECK: // %bb.0: @@ -371,47 +418,39 @@ define void @udot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; 
CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1097,6 +1114,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1158,33 +1176,35 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, 
mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1219,6 +1239,59 @@ entry: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr + ret void +} + +define void @udot_single_za32_u16_vg1x4_x2load_x4tuple(ptr %ptr, i64 %stride, %zn) #0 { +; CHECK-LABEL: udot_single_za32_u16_vg1x4_x2load_x4tuple: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: lsl x9, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: str z9, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: add x10, x9, x1 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z4.b, z12.b }, pn8/z, [x0, x10] +; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b +; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b +; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() + %1 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) + %2 = extractvalue { , } %1, 0 + %3 = extractvalue { , } %1, 1 + %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride + %4 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) + %5 = extractvalue { , } %4, 0 + %6 = extractvalue { , } %4, 1 + %mul3 = shl i64 %stride, 1 + %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 + %7 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4) + %8 = extractvalue { , } %7, 0 + %9 = extractvalue { , } %7, 1 + %mul5 = mul i64 %stride, 3 + %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 + %10 = tail call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6) + %11 = extractvalue { , } %10, 0 + %12 = extractvalue { , } %10, 1 + call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, %2, %5, %8, %11, %zn) + 
call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 0, %3, %6, %9, %12, %zn) ret void } @@ -1322,15 +1395,17 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1345,6 +1420,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1406,33 +1482,35 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, 
mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1467,6 +1545,7 @@ entry: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1572,15 +1651,17 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: 
st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1595,6 +1676,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1656,33 +1738,35 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] 
// 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1717,6 +1801,7 @@ entry: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1822,15 +1907,17 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z2.b, z10.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x0, x1] +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z10.b, z11.b }, z0.b[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1845,6 +1932,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -1906,33 +1994,35 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, 
z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1967,6 +2057,7 @@ entry: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll index e7d1050b60799..63851dd857f97 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll @@ -99,7 +99,7 @@ entry: ret void } -define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: 
svdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill @@ -107,16 +107,18 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -131,6 +133,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -184,7 +187,7 @@ entry: ret void } -define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: svdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill @@ -192,33 +195,35 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, 
[x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -253,6 +258,7 @@ entry: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -322,7 +328,7 @@ entry: ret void } -define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: uvdot_form_2x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill @@ -330,16 +336,18 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: add x9, x0, x1 -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] -; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0] -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0] +; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0] +; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -354,6 +362,7 @@ entry: %6 = extractvalue { , } %4, 1 tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %2, %5, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, %3, %6, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -407,7 +416,7 @@ entry: ret void } -define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: uvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill @@ -415,33 +424,35 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, ; CHECK-NEXT: lsl x9, x1, #1 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: ptrue pn8.b -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: add x10, x9, x1 -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] -; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1] -; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9] -; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] -; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0] -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] +; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] +; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] +; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] +; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, 
[x0] ; CHECK-NEXT: addvl sp, sp, #9 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -476,6 +487,7 @@ entry: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -544,7 +556,7 @@ entry: ret void } -define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: suvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill @@ -552,33 +564,35 @@ define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } @@ -681,7 +696,7 @@ entry: ret void } -define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { +define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %scalable_arg) #0 { ; CHECK-LABEL: usvdot_form_4x_tuple_svecc: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill @@ -689,33 +704,35 @@ define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, %3, %8, %13, %18, poison, i32 0) tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %4, %9, %14, %19, poison, i32 0) tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, %5, %10, %15, %20, poison, i32 0) + store %scalable_arg, ptr %ptr ret void } diff --git a/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir b/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir new file mode 100644 index 0000000000000..1d04cc6d7ca2a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-multivec-regalloc.mir @@ -0,0 +1,184 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs -enable-subreg-liveness -start-before=greedy %s -o - | FileCheck %s + +# No available group of four strided x4 registers, fall back on default allocation order +--- +name: form_4x_tuple_many_live +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + liveins: $x0, $x1, $z0, $z17 + + ; CHECK-LABEL: form_4x_tuple_many_live + ; CHECK: stp d11, d10, [sp, #-48]! 
+ ; CHECK-NEXT: stp d9, d8, [sp, #16] + ; CHECK-NEXT: str x29, [sp, #32] + ; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG + ; CHECK-NEXT: .cfi_offset w29, -16 + ; CHECK-NEXT: .cfi_offset b8, -24 + ; CHECK-NEXT: .cfi_offset b9, -32 + ; CHECK-NEXT: .cfi_offset b10, -40 + ; CHECK-NEXT: .cfi_offset b11, -48 + ; CHECK-NEXT: lsl x9, x1, #1 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] + ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: add x10, x9, x1 + ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x9] + ; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x10] + ; CHECK-NEXT: mov z8.d, z16.d + ; CHECK-NEXT: mov z9.d, z18.d + ; CHECK-NEXT: mov z21.d, z22.d + ; CHECK-NEXT: mov z10.d, z19.d + ; CHECK-NEXT: mov z22.d, z23.d + ; CHECK-NEXT: mov z25.d, z26.d + ; CHECK-NEXT: mov z11.d, z4.d + ; CHECK-NEXT: mov z23.d, z5.d + ; CHECK-NEXT: mov z26.d, z27.d + ; CHECK-NEXT: mov z27.d, z6.d + ; CHECK-NEXT: mov z29.d, z30.d + ; CHECK-NEXT: mov z30.d, z31.d + ; CHECK-NEXT: mov z31.d, z7.d + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] + ; CHECK-NEXT: st1b { z0.b }, p0, [x0] + ; CHECK-NEXT: st1b { z17.b }, p0, [x0] + ; CHECK-NEXT: addvl sp, sp, #2 + ; CHECK-NEXT: ldp d9, d8, [sp, #16] + ; CHECK-NEXT: ldr x29, [sp, #32] + ; CHECK-NEXT: ldp d11, d10, [sp], #48 + ; CHECK-NEXT: ret + + %0:gpr64common = COPY $x0 + %1:gpr64 = COPY $x1 + %2:zpr = COPY $z0 + %3:zpr = COPY $z17 + %5:matrixindexgpr32_8_11 = COPY $wzr + %6:gpr64 = UBFMXri %1, 63, 62 + %pred:pnr_p8to15 = PTRUE_C_B implicit $vg + %7:ppr_3b = PTRUE_B 31, implicit $vg + %8:gpr64 = ADDXrr %6, %1 + %9:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 + %10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 + %11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6 + %12:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8 + %13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub0, %10.zsub0, %11.zsub0, %12.zsub0 + %14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub1, %10.zsub1, %11.zsub1, %12.zsub1 + %15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub2, %10.zsub2, %11.zsub2, %12.zsub2 + %16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %9.zsub3, %10.zsub3, %11.zsub3, %12.zsub3 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0 + ST1B_IMM %2, %7, %0, 0 :: (store () into %stack.0) + ST1B_IMM %3, %7, %0, 0 :: (store () into %stack.0) + RET_ReallyLR +... + +# First multi-vector load to be allocated is not the first operand of the FORM_TRANSPOSED pseudo +--- +name: form_4x_tuple_allocation_order +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + liveins: $x0, $x1, $z0 + + ; CHECK: str x29, [sp, #-16]! 
+ ; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG + ; CHECK-NEXT: .cfi_offset w29, -16 + ; CHECK-NEXT: lsl x9, x1, #1 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: add x10, x9, x1 + ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10] + ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9] + ; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0] + ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0] + ; CHECK-NEXT: st1b { z0.b }, p0, [x0] + ; CHECK-NEXT: addvl sp, sp, #2 + ; CHECK-NEXT: ldr x29, [sp], #16 + ; CHECK-NEXT: ret + + %0:gpr64common = COPY $x0 + %1:gpr64 = COPY $x1 + %2:zpr = COPY $z0 + %5:matrixindexgpr32_8_11 = COPY $wzr + %6:gpr64 = UBFMXri %1, 63, 62 + %pred:pnr_p8to15 = PTRUE_C_B implicit $vg + %7:ppr_3b = PTRUE_B 31, implicit $vg + %8:gpr64 = ADDXrr %6, %1 + %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %8 + %10:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %6 + %11:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1 + %12:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0 + %13:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub0, %11.zsub0, %10.zsub0, %9.zsub0 + %14:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub1, %11.zsub1, %10.zsub1, %9.zsub1 + %15:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub2, %11.zsub2, %10.zsub2, %9.zsub2 + %16:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %12.zsub3, %11.zsub3, %10.zsub3, %9.zsub3 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %13, undef %28:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %14, undef %30:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %15, undef %32:zpr_4b, 0 + $za = UDOT_VG4_M4ZZI_BtoS $za, %5, 0, %16, undef %34:zpr_4b, 0 + ST1B_IMM %2, %7, %0, 0 :: (store () into %stack.0) + RET_ReallyLR +... + +# Strided order is [ $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 ] +# Ensure we don't allocate $z23_z31 & $z0_z8 although they are consecutive +--- + name: udot_form_2x_tuple_live_reg_order + tracksRegLiveness: true + body: | + bb.0.entry: + liveins: $x0, $x1, $z16, $z17, $z18, $z19, $z20, $z21, $z22 + + ; CHECK: stp d9, d8, [sp, #-16]! 
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16 + ; CHECK-NEXT: .cfi_offset b8, -8 + ; CHECK-NEXT: .cfi_offset b9, -16 + ; CHECK-NEXT: ptrue pn8.b + ; CHECK-NEXT: mov w8, wzr + ; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] + ; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1] + ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b + ; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b + ; CHECK-NEXT: ldp d9, d8, [sp], #16 + ; CHECK-NEXT: ret + + %0:gpr64 = COPY $x1 + %1:gpr64common = COPY $x0 + %2:zpr = COPY $z16 + %3:zpr = COPY $z17 + %4:zpr = COPY $z18 + %5:zpr = COPY $z19 + %6:zpr = COPY $z20 + %7:zpr = COPY $z21 + %8:zpr = COPY $z22 + %9:matrixindexgpr32_8_11 = COPY $wzr + %10:pnr_p8to15 = PTRUE_C_B implicit $vg + %11:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO %10, %1, 0 + %12:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO %10, %1, %0 + %13:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub0, %12.zsub0 + %14:zpr2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %11.zsub1, %12.zsub1 + $za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %13, undef %15:zpr_4b + $za = UDOT_VG2_M2ZZ_BtoS $za, %9, 0, %14, undef %16:zpr_4b + RET_ReallyLR +...