diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 351235dd5bcdd..df50375ea3763 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2940,6 +2940,63 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   return NextInst->getParent();
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
+                                           MachineBasicBlock *MBB) const {
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
+  const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
+
+  Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
+  Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
+  Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
+  Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // RDVL requires GPR64 and ADDSVL requires GPR64sp, so we insert COPY
+  // instructions here; these will later be removed by the
+  // RegisterCoalescer.
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
+      .addReg(RegVL_GPR);
+
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
+      .addReg(RegVL_GPRsp)
+      .addImm(-1);
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
+      .addReg(RegSVL_GPRsp);
+
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator It = ++MBB->getIterator();
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, TrapBB);
+  MF->insert(It, PassBB);
+
+  // Continue if the vector lengths match.
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
+      .addReg(RegSVL_GPR)
+      .addMBB(PassBB);
+
+  // Transfer the rest of the current block to PassBB.
+  PassBB->splice(PassBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  PassBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Trap if the vector lengths mismatch.
+  BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
+
+  MBB->addSuccessor(TrapBB);
+  MBB->addSuccessor(PassBB);
+
+  MI.eraseFromParent();
+  return PassBB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                     MachineInstr &MI,
@@ -3343,6 +3400,9 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
   case AArch64::PROBED_STACKALLOC_DYN:
     return EmitDynamicProbedAlloc(MI, BB);
 
+  case AArch64::CHECK_MATCHING_VL_PSEUDO:
+    return EmitCheckMatchingVL(MI, BB);
+
   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -9113,14 +9173,29 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   }
 }
 
-SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
-                                                   bool Enable, SDValue Chain,
-                                                   SDValue InGlue,
-                                                   unsigned Condition) const {
+SDValue AArch64TargetLowering::changeStreamingMode(
+    SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
+    unsigned Condition, bool InsertVectorLengthCheck) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   FuncInfo->setHasStreamingModeChanges(true);
 
+  auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
+    SmallVector<SDValue> Ops = {Chain};
+    if (InGlue)
+      Ops.push_back(InGlue);
+    return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
+                       DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  };
+
+  if (InsertVectorLengthCheck && Enable) {
+    // Non-streaming -> Streaming
+    // Insert the vector length check before the smstart.
+    SDValue CheckVL = GetCheckVL(Chain, InGlue);
+    Chain = CheckVL.getValue(0);
+    InGlue = CheckVL.getValue(1);
+  }
+
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
   SDValue MSROp =
@@ -9147,7 +9222,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
   if (InGlue)
     Ops.push_back(InGlue);
 
-  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  SDValue SMChange =
+      DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+
+  if (!InsertVectorLengthCheck || Enable)
+    return SMChange;
+
+  // Streaming -> Non-streaming
+  // Insert the vector length check after the smstop, since the non-streaming
+  // VL cannot be read while in streaming mode.
+  return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
 }
 
 // Emit a call to __arm_sme_save or __arm_sme_restore.
@@ -9730,9 +9814,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-    Chain =
-        changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
-                            Chain, InGlue, getSMToggleCondition(CallAttrs));
+    bool InsertVectorLengthCheck =
+        (CallConv == CallingConv::AArch64_SVE_VectorCall);
+    Chain = changeStreamingMode(
+        DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
+        getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
     InGlue = Chain.getValue(1);
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f5d14905cac66..ff073d3eafb1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -168,6 +168,9 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const;
 
+  MachineBasicBlock *EmitCheckMatchingVL(MachineInstr &MI,
+                                         MachineBasicBlock *MBB) const;
+
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
@@ -532,8 +535,8 @@ class AArch64TargetLowering : public TargetLowering {
   /// node. \p Condition should be one of the enum values from
   /// AArch64SME::ToggleCondition.
   SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
-                              SDValue Chain, SDValue InGlue,
-                              unsigned Condition) const;
+                              SDValue Chain, SDValue InGlue, unsigned Condition,
+                              bool InsertVectorLengthCheck = false) const;
 
   bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 601dc34d74b9c..430b7382de216 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -48,6 +48,17 @@ let usesCustomInserter = 1 in {
 }
 def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
 
+// Pseudo-instruction that compares the current SVE vector length (VL) with the
+// streaming vector length (SVL). If the two lengths do not match, the check
+// lowers to a `brk`, causing a trap.
+let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in
+def CHECK_MATCHING_VL_PSEUDO : Pseudo<(outs), (ins), []>, Sched<[]>;
+
+def AArch64_check_matching_vl
+    : SDNode<"AArch64ISD::CHECK_MATCHING_VL", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def : Pat<(AArch64_check_matching_vl), (CHECK_MATCHING_VL_PSEUDO)>;
+
 //===----------------------------------------------------------------------===//
 // Old SME ABI lowering ISD nodes/pseudos (deprecated)
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index cf42db7aa65bd..b58a857f3a3cb 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -47,12 +47,18 @@ define void @fbyte( %v) #0{
 ; NOPAIR-NEXT:  // %bb.1:
 ; NOPAIR-NEXT:    smstop sm
 ; NOPAIR-NEXT:  .LBB0_2:
+; NOPAIR-NEXT:    rdvl x8, #1
+; NOPAIR-NEXT:    addsvl x8, x8, #-1
+; NOPAIR-NEXT:    cbz x8, .LBB0_4
+; NOPAIR-NEXT:  // %bb.3:
+; NOPAIR-NEXT:    brk #0x1
+; NOPAIR-NEXT:  .LBB0_4:
 ; NOPAIR-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; NOPAIR-NEXT:    bl my_func2
-; NOPAIR-NEXT:    tbz w19, #0, .LBB0_4
-; NOPAIR-NEXT:  // %bb.3:
+; NOPAIR-NEXT:    tbz w19, #0, .LBB0_6
+; NOPAIR-NEXT:  // %bb.5:
 ; NOPAIR-NEXT:    smstart sm
-; NOPAIR-NEXT:  .LBB0_4:
+; NOPAIR-NEXT:  .LBB0_6:
 ; NOPAIR-NEXT:    addvl sp, sp, #1
 ; NOPAIR-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; NOPAIR-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -127,12 +133,18 @@ define void @fbyte( %v) #0{
 ; PAIR-NEXT:  // %bb.1:
 ; PAIR-NEXT:    smstop sm
 ; PAIR-NEXT:  .LBB0_2:
+; PAIR-NEXT:    rdvl x8, #1
+; PAIR-NEXT:    addsvl x8, x8, #-1
+; PAIR-NEXT:    cbz x8, .LBB0_4
+; PAIR-NEXT:  // %bb.3:
+; PAIR-NEXT:    brk #0x1
+; PAIR-NEXT:  .LBB0_4:
 ; PAIR-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; PAIR-NEXT:    bl my_func2
-; PAIR-NEXT:    tbz w19, #0, .LBB0_4
-; PAIR-NEXT:  // %bb.3:
+; PAIR-NEXT:    tbz w19, #0, .LBB0_6
+; PAIR-NEXT:  // %bb.5:
 ; PAIR-NEXT:    smstart sm
-; PAIR-NEXT:  .LBB0_4:
+; PAIR-NEXT:  .LBB0_6:
 ; PAIR-NEXT:    addvl sp, sp, #1
 ; PAIR-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; PAIR-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 80827c2547780..1659b217ce0be 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -527,14 +527,24 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
-; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    addsvl x8, x8, #-1
+; CHECK-NEXT:    cbnz x8, .LBB14_2
+; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    bl callee_farg_fret
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    addsvl x8, x8, #-1
+; CHECK-NEXT:    cbz x8, .LBB14_3
+; CHECK-NEXT:  .LBB14_2:
+; CHECK-NEXT:    brk #0x1
+; CHECK-NEXT:  .LBB14_3:
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl callee_farg_fret
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
diff --git
a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll new file mode 100644 index 0000000000000..0ac46085d683f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-before=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BEFORE-ISEL +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-AFTER-ISEL + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 +declare void @bar() +declare @bar_retv_enabled() #0 +declare @bar_retv() + +; Non-streaming -> calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, 
implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-BEFORE-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_pass_arg + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load () from %ir.arg) + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]] + ; CHECK-AFTER-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} + +; Non-streaming -> returns SVE value from streaming callee +define void 
@foo_non_streaming_retval(ptr %ptr) { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { + ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-BEFORE-ISEL: bb.0.entry: + ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0 + ; CHECK-BEFORE-ISEL-NEXT: {{ $}} + ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; 
CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO + ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR + ; + ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_retval + ; CHECK-AFTER-ISEL: bb.0.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-AFTER-ISEL-NEXT: liveins: $x0 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]] + ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg + ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]] + ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.1.entry: + ; CHECK-AFTER-ISEL-NEXT: successors: + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: BRK 1 + ; CHECK-AFTER-ISEL-NEXT: {{ $}} + ; CHECK-AFTER-ISEL-NEXT: bb.2.entry: + ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0 + ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]] + ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store () into %ir.ptr) + ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } +attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll new file mode 100644 index 0000000000000..a1eb1ceeaf19b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll @@ -0,0 +1,478 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare void @bar_enabled() #0 +declare void @bar() +declare 
@bar_retv_enabled() #0 +declare @bar_retv() + +; Non-streaming -> calls streaming callee +define void @foo_non_streaming_pass_arg(ptr %arg) { +; CHECK-LABEL: foo_non_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB0_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: smstop sm +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 96 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming-compatible -> calls streaming callee +define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 { +; CHECK-LABEL: foo_streaming_compatible_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; 
CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB1_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: bl bar_enabled +; CHECK-NEXT: tbnz w19, #0, .LBB1_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_6: // %entry +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar_enabled( %v) #0 + ret void +} + +; Streaming -> calls non-streaming callee +define void @foo_streaming_pass_arg(ptr %arg) #0 { +; CHECK-LABEL: foo_streaming_pass_arg: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1120 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w28, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -1064 +; CHECK-NEXT: .cfi_offset b9, -1072 +; CHECK-NEXT: .cfi_offset b10, -1080 +; CHECK-NEXT: .cfi_offset b11, -1088 +; CHECK-NEXT: .cfi_offset b12, -1096 +; CHECK-NEXT: .cfi_offset b13, -1104 +; CHECK-NEXT: .cfi_offset b14, -1112 +; 
CHECK-NEXT: .cfi_offset b15, -1120 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 2144 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: bl bar +; CHECK-NEXT: smstart sm +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1120 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1120 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = load , ptr %arg, align 16 + tail call void @bar( %v) + ret void +} + +; Non-streaming -> returns SVE value from streaming callee +define void @foo_non_streaming_retval(ptr %ptr) { +; CHECK-LABEL: foo_non_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -56 +; CHECK-NEXT: .cfi_offset b9, -64 +; CHECK-NEXT: .cfi_offset b10, -72 +; CHECK-NEXT: .cfi_offset b11, -80 +; CHECK-NEXT: .cfi_offset b12, -88 +; CHECK-NEXT: .cfi_offset b13, -96 +; CHECK-NEXT: .cfi_offset b14, -104 +; CHECK-NEXT: .cfi_offset b15, -112 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #64 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 112 +; CHECK-NEXT: ldp x28, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 
16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming-compatible -> returns SVE value from streaming callee +define void @foo_streaming_compatible_retval(ptr %ptr) #1 { +; CHECK-LABEL: foo_streaming_compatible_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mrs x20, SVCR +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB4_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: tbnz w20, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB4_4: // %entry +; CHECK-NEXT: bl bar_retv_enabled +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w20, #0, .LBB4_6 +; CHECK-NEXT: // %bb.5: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB4_6: // %entry +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, 
d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w20 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv_enabled() #0 + store %v, ptr %ptr, align 16 + ret void +} + +; Streaming -> returns SVE value from non-streaming callee +define void @foo_streaming_retval(ptr %ptr) #0 { +; CHECK-LABEL: foo_streaming_retval: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 1136 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1088 +; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w28, -24 +; CHECK-NEXT: .cfi_offset vg, -32 +; CHECK-NEXT: .cfi_offset w30, -40 +; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset b8, -1080 +; CHECK-NEXT: .cfi_offset b9, -1088 +; CHECK-NEXT: .cfi_offset b10, -1096 +; CHECK-NEXT: .cfi_offset b11, -1104 +; CHECK-NEXT: .cfi_offset b12, -1112 +; CHECK-NEXT: .cfi_offset b13, -1120 +; CHECK-NEXT: .cfi_offset b14, -1128 +; CHECK-NEXT: .cfi_offset b15, -1136 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB5_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB5_2: // %entry +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl bar_retv +; CHECK-NEXT: sub x8, x29, #1088 +; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: str z0, [x19] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa wsp, 1136 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1136 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore 
b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +entry: + %v = tail call @bar_retv() + store %v, ptr %ptr, align 16 + ret void +} + +attributes #0 = { "aarch64_pstate_sm_enabled" } +attributes #1 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 9088986ee9b72..f2163ad15bafc 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -209,13 +209,19 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( %x) #0 { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: smstop sm +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: addsvl x8, x8, #-1 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: brk #0x1 +; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: bl scalable_callee ; CHECK-NEXT: smstart sm @@ -472,6 +478,12 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: //APP ; FP-CHECK-NEXT: //NO_APP ; FP-CHECK-NEXT: smstop sm +; FP-CHECK-NEXT: rdvl x8, #1 +; FP-CHECK-NEXT: addsvl x8, x8, #-1 +; FP-CHECK-NEXT: cbz x8, .LBB3_2 +; FP-CHECK-NEXT: // %bb.1: +; FP-CHECK-NEXT: brk #0x1 +; FP-CHECK-NEXT: .LBB3_2: ; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload ; FP-CHECK-NEXT: bl scalable_callee ; FP-CHECK-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll index a23854759d688..33a4ecd56e35b 100644 --- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -2,7 +2,7 @@ ; We should have both spill and reload for %arg. -; CHECK: remark: :0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function +; CHECK: remark: :0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost generated in function define @streaming_compatible_with_predicate_vectors( %arg) "aarch64_pstate_sm_compatible" nounwind #0 { %res = call @normal_callee_predicate_vec_arg( %arg) %and = and %res, %arg
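Note (not part of the patch): a minimal C++ sketch of the run-time property enforced by the RDVL/ADDSVL/CBZ/BRK sequence that EmitCheckMatchingVL emits at an SVE vector-call boundary that toggles streaming mode. The function and parameter names below are illustrative only and do not appear in the patch.

#include <cstdint>
#include <cstdlib>

// Conceptual model of the emitted guard:
//   rdvl   x8, #1       ; x8 = non-streaming vector length VL, in bytes
//   addsvl x8, x8, #-1  ; x8 = VL - SVL (SVL = streaming vector length, in bytes)
//   cbz    x8, pass     ; lengths match -> continue with the call
//   brk    #0x1         ; lengths differ -> trap
void checkMatchingVL(uint64_t VLBytes, uint64_t SVLBytes) {
  int64_t Diff = int64_t(VLBytes) - int64_t(SVLBytes); // ADDSVL with imm -1
  if (Diff != 0)   // CBZ falls through to the trap block when VL != SVL
    std::abort();  // stands in for BRK #0x1
}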