diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 1028149bf513f..ed5bce35bcc0f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
   return PushOrLibCallsCSI;
 }
 
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+    MachineInstr::MIFlag Flag, bool EmitCFI) const {
+  assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+  // Emit a variable-length allocation probing loop.
+
+  // Get VLEN in TargetReg
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
+  Register TargetReg = RISCV::X6;
+  uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8);
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+      .setMIFlag(Flag);
+  TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+  if (EmitCFI) {
+    // Set the CFA register to TargetReg.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // It will be expanded to a probe loop in `inlineStackProbe`.
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+      .addReg(SPReg)
+      .addReg(TargetReg);
+
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // SUB SP, SP, T1
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+      .addReg(SPReg)
+      .addReg(TargetReg)
+      .setMIFlag(Flag);
+}
+
 static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
                                            SmallVectorImpl<char> &Expr,
                                            int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  uint64_t SecondSPAdjustAmount = 0;
   // Emit the second SP adjustment after saving callee saved registers.
   if (FirstSPAdjustAmount) {
-    uint64_t SecondSPAdjustAmount =
-        getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+    SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
     assert(SecondSPAdjustAmount > 0 &&
            "SecondSPAdjustAmount should be greater than zero");
 
@@ -870,11 +918,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (RVVStackSize) {
-    // We must keep the stack pointer aligned through any intermediate
-    // updates.
-    RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                  StackOffset::getScalable(-RVVStackSize),
-                  MachineInstr::FrameSetup, getStackAlign());
+    if (NeedProbe) {
+      allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+                                  MachineInstr::FrameSetup, !hasFP(MF));
+    } else {
+      // We must keep the stack pointer aligned through any intermediate
+      // updates.
+      RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+                    StackOffset::getScalable(-RVVStackSize),
+                    MachineInstr::FrameSetup, getStackAlign());
+    }
 
     if (!hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
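The arithmetic in allocateAndProbeStackForRVV above: the compile-time scalable size is first reduced to a whole number of vector registers and only becomes a byte count at run time, once vlenb is known. A minimal standalone sketch of that conversion (illustrative only, not LLVM API code; RISCV::RVVBitsPerBlock is 64, so each unit of the scalable size stands for 8 bytes per vlenb):

  #include <cassert>
  #include <cstdint>

  constexpr uint64_t RVVBitsPerBlock = 64;

  // Value the prologue materializes in t1 via PseudoReadVLENB + mulImm().
  uint64_t rvvStackBytes(int64_t Amount /* scalable bytes */, uint64_t VLenB) {
    assert(Amount > 0 && Amount % (RVVBitsPerBlock / 8) == 0);
    uint64_t NumOfVReg = Amount / (RVVBitsPerBlock / 8);
    return NumOfVReg * VLenB;
  }

For Amount = 16 (two whole vector registers), NumOfVReg is 2 and the multiply degenerates to the `slli t1, t1, 1` visible in the f_vector checks further down.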
@@ -914,6 +967,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
             .addImm(ShiftAmount)
             .setMIFlag(MachineInstr::FrameSetup);
       }
+      if (NeedProbe && RVVStackSize == 0) {
+        // Do a probe if the align + size allocated just passed the probe size
+        // and was not yet probed.
+        if (SecondSPAdjustAmount < ProbeSize &&
+            SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) {
+          bool IsRV64 = STI.is64Bit();
+          BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+              .addReg(RISCV::X0)
+              .addReg(SPReg)
+              .addImm(0)
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+      }
       // FP will be used to restore the frame in the epilogue, so we need
       // another base register BP to record SP after re-alignment. SP will
       // track the current stack after allocating variable sized objects.
@@ -2017,8 +2083,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 
 // Synthesize the probe loop.
 static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 DebugLoc DL) {
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                 Register TargetReg, bool IsRVV) {
+  assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
 
@@ -2034,7 +2101,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
   MF.insert(MBBInsertPoint, ExitMBB);
   MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
-  Register TargetReg = RISCV::X6;
   Register ScratchReg = RISCV::X7;
 
   // ScratchReg = ProbeSize
@@ -2055,12 +2121,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
       .addImm(0)
       .setMIFlags(Flags);
 
-  // BNE SP, TargetReg, LoopTest
-  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
-      .addReg(SPReg)
-      .addReg(TargetReg)
-      .addMBB(LoopTestMBB)
-      .setMIFlags(Flags);
+  if (IsRVV) {
+    // SUB TargetReg, TargetReg, ProbeSize
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+            TargetReg)
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .setMIFlags(Flags);
+
+    // BGE TargetReg, ProbeSize, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+
+  } else {
+    // BNE SP, TargetReg, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+        .addReg(SPReg)
+        .addReg(TargetReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+  }
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
 
@@ -2073,12 +2156,27 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
-  auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
-    return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
-  });
-  if (Where != MBB.end()) {
-    DebugLoc DL = MBB.findDebugLoc(Where);
-    emitStackProbeInline(MF, MBB, Where, DL);
-    Where->eraseFromParent();
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector ToReplace;
+  for (MachineInstr &MI : MBB) {
+    unsigned Opc = MI.getOpcode();
+    if (Opc == RISCV::PROBED_STACKALLOC ||
+        Opc == RISCV::PROBED_STACKALLOC_RVV) {
+      ToReplace.push_back(&MI);
+    }
+  }
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == RISCV::PROBED_STACKALLOC ||
+        MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) {
+      MachineBasicBlock::iterator MBBI = MI->getIterator();
+      DebugLoc DL = MBB.findDebugLoc(MBBI);
+      Register TargetReg = MI->getOperand(1).getReg();
+      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      MBBI->eraseFromParent();
+    }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 190c063d9d3b5..26d2a26d681c3 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -107,6 +107,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
   // Replace a StackProbe stub (if any) with the actual probe code inline
   void inlineStackProbe(MachineFunction &MF,
                         MachineBasicBlock &PrologueMBB) const override;
+  void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, int64_t Amount,
+                                   MachineInstr::MIFlag Flag,
+                                   bool EmitCFI) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 14b571cebe1fe..d77e416a970b2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
                                (ins GPR:$scratch),
                                []>,
                                Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+                                   (ins GPR:$scratch),
+                                   []>,
+                                   Sched<[]>;
 }
 
 /// HI and ADD_LO address nodes.
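To connect the pieces: the PROBED_STACKALLOC_RVV pseudo declared above is what allocateAndProbeStackForRVV emits, and inlineStackProbe later rewrites it through emitStackProbeInline into the loop exercised by the tests below. A rough C-level model of that expansion plus the trailing `sub sp, sp, t1` (a sketch assuming the 4096-byte probe size the tests materialize with `lui t2, 1`; not generated code):

  #include <cstdint>

  void probeScalableAlloc(uintptr_t &SP, int64_t T1 /* RVV frame size in bytes */) {
    const int64_t ProbeSize = 4096;                  // lui  t2, 1
    do {                                             // .LBBn_1:
      SP -= ProbeSize;                               //   sub sp, sp, t2
      *reinterpret_cast<volatile int64_t *>(SP) = 0; //   sd zero, 0(sp)  (sw on RV32)
      T1 -= ProbeSize;                               //   sub t1, t1, t2
    } while (T1 >= ProbeSize);                       //   bge t1, t2, .LBBn_1
    // T1 is now below ProbeSize and may even be negative; the final SUB
    // allocates the remaining tail or hands back the over-allocation, so no
    // further probe is needed.
    SP -= T1;                                        // sub sp, sp, t1
  }

The do/while shape matches the generated code: the loop body always runs at least once, which is harmless because the final `sub sp, sp, t1` corrects any overshoot when the scalable frame turns out to be smaller than one probe step.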
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll index c6a3649c9ba8f..0052f4b9c041e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll @@ -64,3 +64,49 @@ define @access_fixed_and_vector_objects(ptr %val) { ret %a } + +define @probe_fixed_and_vector_objects(ptr %val, %dummy) "probe-stack"="inline-asm" { +; RV64IV-LABEL: probe_fixed_and_vector_objects: +; RV64IV: # %bb.0: +; RV64IV-NEXT: addi sp, sp, -528 +; RV64IV-NEXT: .cfi_def_cfa_offset 528 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: .cfi_def_cfa t1, -8 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB2_1 +; RV64IV-NEXT: # %bb.2: +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb +; RV64IV-NEXT: addi a0, sp, 8 +; RV64IV-NEXT: vl1re64.v v9, (a0) +; RV64IV-NEXT: addi a0, sp, 528 +; RV64IV-NEXT: vl1re64.v v10, (a0) +; RV64IV-NEXT: ld a0, 520(sp) +; RV64IV-NEXT: vsetvli zero, a0, e64, m1, tu, ma +; RV64IV-NEXT: vadd.vv v8, v9, v10 +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 528 +; RV64IV-NEXT: addi sp, sp, 528 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret + %local = alloca i64 + %vector = alloca + %array = alloca [64 x i64] + %v1 = load , ptr %array + %v2 = load , ptr %vector + %len = load i64, ptr %local + + %a = call @llvm.riscv.vadd.nxv1i64.nxv1i64( + %dummy, + %v1, + %v2, + i64 %len) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll new file mode 100644 index 0000000000000..d7f9ae73eaea5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IV +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IV + +; Tests adapted from AArch64. + +; Test prolog sequences for stack probing when vector is involved. + +; The space for vector objects needs probing in the general case, because +; the stack adjustment may happen to be too big (i.e. greater than the +; probe size). 
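A quick sanity check on the preamble's "may happen to be too big" (illustrative numbers, assuming the architectural maximum VLEN of 65536 bits; not part of the test):

  constexpr unsigned long MaxVLenBits = 65536;        // upper bound from the V spec
  constexpr unsigned long MaxVLenB = MaxVLenBits / 8; // 8192 bytes per vector register
  constexpr unsigned long ProbeSize = 4096;           // one guard page
  static_assert(2 * MaxVLenB > ProbeSize,
                "even the 2 * vlenb frame in @f_vector can skip a guard page");

Because vlenb is unknown at compile time, the prologue cannot statically prove the scalable area fits under one guard page, so it always goes through the probing loop when stack clash protection is requested.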
+ +define void @f_vector(ptr %out) #0 { +; RV64IV-LABEL: f_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 1 +; RV64IV-NEXT: .cfi_def_cfa t1, -16 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB0_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB0_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 1 +; RV32IV-NEXT: .cfi_def_cfa t1, -16 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB0_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB0_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 vectors of stack space. +define void @f4_vector(ptr %out) #0 { +; RV64IV-LABEL: f4_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 3 +; RV64IV-NEXT: .cfi_def_cfa t1, -64 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB1_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB1_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 3 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f4_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 3 +; RV32IV-NEXT: .cfi_def_cfa t1, -64 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB1_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB1_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 3 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; As above, but with 16 vectors of stack space. +; The stack adjustment is less than or equal to 16 x 256 = 4096, so +; we can allocate the locals at once. 
+define void @f16_vector(ptr %out) #0 { +; RV64IV-LABEL: f16_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 5 +; RV64IV-NEXT: .cfi_def_cfa t1, -256 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB2_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB2_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 5 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f16_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 5 +; RV32IV-NEXT: .cfi_def_cfa t1, -256 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB2_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB2_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 5 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + ret void +} + +; As above, but with 17 vectors of stack space. 
+define void @f17_vector(ptr %out) #0 { +; RV64IV-LABEL: f17_vector: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: li a0, 34 +; RV64IV-NEXT: mul t1, t1, a0 +; RV64IV-NEXT: .cfi_def_cfa t1, -272 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB3_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB3_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: li a1, 34 +; RV64IV-NEXT: mul a0, a0, a1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f17_vector: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: li a0, 34 +; RV32IV-NEXT: mul t1, t1, a0 +; RV32IV-NEXT: .cfi_def_cfa t1, -272 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB3_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB3_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: li a1, 34 +; RV32IV-NEXT: mul a0, a0, a1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 0 +; RV32IV-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + %vec17 = alloca , align 16 + ret void +} + +; A vector and a 16-byte fixed size object. 
+define void @f1_vector_16_arr(ptr %out) #0 { +; RV64IV-LABEL: f1_vector_16_arr: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: addi sp, sp, -16 +; RV64IV-NEXT: .cfi_def_cfa_offset 16 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 1 +; RV64IV-NEXT: .cfi_def_cfa t1, -16 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB4_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB4_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 1 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 16 +; RV64IV-NEXT: addi sp, sp, 16 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f1_vector_16_arr: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: addi sp, sp, -16 +; RV32IV-NEXT: .cfi_def_cfa_offset 16 +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 1 +; RV32IV-NEXT: .cfi_def_cfa t1, -16 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB4_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB4_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 1 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 16 +; RV32IV-NEXT: addi sp, sp, 16 +; RV32IV-NEXT: .cfi_def_cfa_offset 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; A large vector object and a large slot, both of which need probing. 
+define void @f1_vector_4096_arr(ptr %out) #0 { +; RV64IV-LABEL: f1_vector_4096_arr: +; RV64IV: # %bb.0: # %entry +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 4096 +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 8192 +; RV64IV-NEXT: lui a0, 1 +; RV64IV-NEXT: sub sp, sp, a0 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: .cfi_def_cfa_offset 12288 +; RV64IV-NEXT: addi sp, sp, -16 +; RV64IV-NEXT: .cfi_def_cfa_offset 12304 +; RV64IV-NEXT: csrr t1, vlenb +; RV64IV-NEXT: slli t1, t1, 7 +; RV64IV-NEXT: .cfi_def_cfa t1, -1024 +; RV64IV-NEXT: lui t2, 1 +; RV64IV-NEXT: .LBB5_1: # %entry +; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IV-NEXT: sub sp, sp, t2 +; RV64IV-NEXT: sd zero, 0(sp) +; RV64IV-NEXT: sub t1, t1, t2 +; RV64IV-NEXT: bge t1, t2, .LBB5_1 +; RV64IV-NEXT: # %bb.2: # %entry +; RV64IV-NEXT: .cfi_def_cfa_register sp +; RV64IV-NEXT: sub sp, sp, t1 +; RV64IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb +; RV64IV-NEXT: csrr a0, vlenb +; RV64IV-NEXT: slli a0, a0, 7 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa sp, 12304 +; RV64IV-NEXT: lui a0, 3 +; RV64IV-NEXT: addiw a0, a0, 16 +; RV64IV-NEXT: add sp, sp, a0 +; RV64IV-NEXT: .cfi_def_cfa_offset 0 +; RV64IV-NEXT: ret +; +; RV32IV-LABEL: f1_vector_4096_arr: +; RV32IV: # %bb.0: # %entry +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 4096 +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 8192 +; RV32IV-NEXT: lui a0, 1 +; RV32IV-NEXT: sub sp, sp, a0 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: .cfi_def_cfa_offset 12288 +; RV32IV-NEXT: addi sp, sp, -16 +; RV32IV-NEXT: .cfi_def_cfa_offset 12304 +; RV32IV-NEXT: csrr t1, vlenb +; RV32IV-NEXT: slli t1, t1, 7 +; RV32IV-NEXT: .cfi_def_cfa t1, -1024 +; RV32IV-NEXT: lui t2, 1 +; RV32IV-NEXT: .LBB5_1: # %entry +; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IV-NEXT: sub sp, sp, t2 +; RV32IV-NEXT: sw zero, 0(sp) +; RV32IV-NEXT: sub t1, t1, t2 +; RV32IV-NEXT: bge t1, t2, .LBB5_1 +; RV32IV-NEXT: # %bb.2: # %entry +; RV32IV-NEXT: .cfi_def_cfa_register sp +; RV32IV-NEXT: sub sp, sp, t1 +; RV32IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb +; RV32IV-NEXT: csrr a0, vlenb +; RV32IV-NEXT: slli a0, a0, 7 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa sp, 12304 +; RV32IV-NEXT: lui a0, 3 +; RV32IV-NEXT: addi a0, a0, 16 +; RV32IV-NEXT: add sp, sp, a0 +; RV32IV-NEXT: .cfi_def_cfa_offset 0 +; RV32IV-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll index 18af080e86747..843e57a42d926 100644 --- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll +++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll @@ -538,4 +538,72 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 { ret i32 %c } +; alloca < probe_size, align < probe_size, alloca + align > probe_size +define i32 @f10(i64 %i) local_unnamed_addr #0 { +; RV64I-LABEL: f10: +; 
RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: addi s0, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa s0, 0 +; RV64I-NEXT: addi sp, sp, -2048 +; RV64I-NEXT: addi sp, sp, -1040 +; RV64I-NEXT: andi sp, sp, -1024 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: addi a1, sp, 1024 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sw a1, 0(a0) +; RV64I-NEXT: lw a0, 1024(sp) +; RV64I-NEXT: addi sp, s0, -2032 +; RV64I-NEXT: .cfi_def_cfa sp, 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV32I-LABEL: f10: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: addi s0, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa s0, 0 +; RV32I-NEXT: addi sp, sp, -2048 +; RV32I-NEXT: addi sp, sp, -1040 +; RV32I-NEXT: andi sp, sp, -1024 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: addi a1, sp, 1024 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: lw a0, 1024(sp) +; RV32I-NEXT: addi sp, s0, -2032 +; RV32I-NEXT: .cfi_def_cfa sp, 2032 +; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret + %a = alloca i32, i32 1000, align 1024 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %c = load volatile i32, ptr %a + ret i32 %c +} + attributes #0 = { "probe-stack"="inline-asm" }
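The new f10 case ties back to the single-probe path added in emitPrologue: the allocation and the realignment are each below the probe size, but their sum can cross a guard page, so exactly one `sd zero, 0(sp)` (`sw` on RV32) is emitted right after the `andi sp, sp, -1024`. In the test's own numbers (a model of that window, not compiler code):

  constexpr unsigned ProbeSize   = 4096;
  constexpr unsigned AllocaBytes = 1000 * 4; // %a = alloca i32, i32 1000
  constexpr unsigned AlignBytes  = 1024;     // align 1024
  static_assert(AllocaBytes < ProbeSize, "alloca alone stays under the guard page");
  static_assert(AlignBytes < ProbeSize, "alignment alone stays under the guard page");
  static_assert(AllocaBytes + AlignBytes > ProbeSize,
                "together they can cross a page, hence the single trailing probe");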