From 68e37b28c712a3ecaf130131ea8e90b263b62d6f Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly
Date: Tue, 10 Dec 2024 17:50:50 -0300
Subject: [PATCH 1/3] [RISCV] Add stack clash vector support

Use the probe loop structure to allocate the stack space for RVV
objects as well. We add the pseudo instruction
RISCV::PROBED_STACKALLOC_RVV to differentiate it from the normal probe
loop.
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 148 +++++--
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    |  11 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       |   4 +
 .../RISCV/rvv/access-fixed-objects-by-rvv.ll  |  46 ++
 .../CodeGen/RISCV/rvv/stack-probing-rvv.ll    | 400 ++++++++++++++++++
 5 files changed, 585 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 1028149bf513f..04f841d589ce8 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
   return PushOrLibCallsCSI;
 }
 
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+    MachineInstr::MIFlag Flag, bool EmitCFI) const {
+  assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+  // Emit a variable-length allocation probing loop.
+
+  // Get VLEN in TargetReg
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
+  Register TargetReg = RISCV::X6;
+  uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8);
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+      .setMIFlag(Flag);
+  TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+  if (EmitCFI) {
+    // Set the CFA register to TargetReg.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // It will be expanded to a probe loop in `inlineStackProbe`.
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+      .addReg(SPReg)
+      .addReg(TargetReg);
+
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // SUB SP, SP, T1
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+      .addReg(SPReg)
+      .addReg(TargetReg)
+      .setMIFlag(Flag);
+}
+
 static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
                                            SmallVectorImpl<char> &Expr,
                                            int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  uint64_t SecondSPAdjustAmount = 0;
   // Emit the second SP adjustment after saving callee saved registers.
   if (FirstSPAdjustAmount) {
-    uint64_t SecondSPAdjustAmount =
-        getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+    SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
     assert(SecondSPAdjustAmount > 0 &&
            "SecondSPAdjustAmount should be greater than zero");
@@ -870,11 +918,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (RVVStackSize) {
-    // We must keep the stack pointer aligned through any intermediate
-    // updates.
-    RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                  StackOffset::getScalable(-RVVStackSize),
-                  MachineInstr::FrameSetup, getStackAlign());
+    if (NeedProbe)
+      allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+                                  MachineInstr::FrameSetup, !hasFP(MF));
+    else
+      // We must keep the stack pointer aligned through any intermediate
+      // updates.
+      RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+                    StackOffset::getScalable(-RVVStackSize),
+                    MachineInstr::FrameSetup, getStackAlign());
 
     if (!hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
@@ -914,6 +966,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
           .addImm(ShiftAmount)
           .setMIFlag(MachineInstr::FrameSetup);
     }
+    if (NeedProbe && RVVStackSize == 0) {
+      // Do a probe if the align + size allocated just passed the probe size
+      // and was not yet probed.
+      if (SecondSPAdjustAmount < ProbeSize &&
+          SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) {
+        bool IsRV64 = STI.is64Bit();
+        BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+            .addReg(RISCV::X0)
+            .addReg(SPReg)
+            .addImm(0)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
+    }
     // FP will be used to restore the frame in the epilogue, so we need
     // another base register BP to record SP after re-alignment. SP will
     // track the current stack after allocating variable sized objects.
@@ -2016,9 +2081,11 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 DebugLoc DL) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+    bool IsRVV) const {
+  assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
+
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
@@ -2034,7 +2101,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
   MF.insert(MBBInsertPoint, ExitMBB);
   MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
-  Register TargetReg = RISCV::X6;
   Register ScratchReg = RISCV::X7;
 
   // ScratchReg = ProbeSize
@@ -2055,12 +2121,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
       .addImm(0)
       .setMIFlags(Flags);
 
-  // BNE SP, TargetReg, LoopTest
-  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
-      .addReg(SPReg)
-      .addReg(TargetReg)
-      .addMBB(LoopTestMBB)
-      .setMIFlags(Flags);
+  if (IsRVV) {
+    // SUB TargetReg, TargetReg, ProbeSize
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+            TargetReg)
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .setMIFlags(Flags);
+
+    // BGE TargetReg, ProbeSize, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+
+  } else {
+    // BNE SP, TargetReg, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+        .addReg(SPReg)
+        .addReg(TargetReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+  }
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
@@ -2069,16 +2152,33 @@
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
-  auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
-    return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
-  });
-  if (Where != MBB.end()) {
-    DebugLoc DL = MBB.findDebugLoc(Where);
-    emitStackProbeInline(MF, MBB, Where, DL);
-    Where->eraseFromParent();
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector<MachineInstr *, 8> ToReplace;
+  for (MachineInstr &MI : MBB) {
+    int Opc = MI.getOpcode();
+    if (Opc == RISCV::PROBED_STACKALLOC ||
+        Opc == RISCV::PROBED_STACKALLOC_RVV) {
+      ToReplace.push_back(&MI);
+    }
+  }
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == RISCV::PROBED_STACKALLOC ||
+        MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) {
+      MachineBasicBlock::iterator MBBI = MI->getIterator();
+      DebugLoc DL = MBB.findDebugLoc(MBBI);
+      Register TargetReg = MI->getOperand(1).getReg();
+      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      MBBI->eraseFromParent();
+    }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 190c063d9d3b5..1a2c6e0302623 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
                              uint64_t RealStackSize, bool EmitCFI,
                              bool NeedProbe, uint64_t ProbeSize) const;
 
+  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          DebugLoc DL, Register TargetReg,
+                                          bool IsRVV) const;
+
 protected:
   const RISCVSubtarget &STI;
 
@@ -107,6 +113,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
   // Replace a StackProbe stub (if any) with the actual probe code inline
   void inlineStackProbe(MachineFunction &MF,
                         MachineBasicBlock &PrologueMBB) const override;
+  void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, int64_t Amount,
+                                   MachineInstr::MIFlag Flag,
+                                   bool EmitCFI) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 14b571cebe1fe..d77e416a970b2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
                                (ins GPR:$scratch),
                                []>,
                                Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+                                   (ins GPR:$scratch),
+                                   []>,
+                                   Sched<[]>;
 }
 
 /// HI and ADD_LO address nodes.
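How the two pseudos divide the work, with a small IR sketch (the function
name and sizes below are illustrative, not taken from the patch):
PROBED_STACKALLOC keeps covering fixed-size allocations, expanding to the
BNE loop that walks SP down to a precomputed target, while
PROBED_STACKALLOC_RVV covers the scalable part of the frame, expanding to
the SUB/BGE loop above that counts the remaining bytes down in TargetReg,
since the total is only known at runtime as a multiple of vlenb.

; Illustrative input only (names and sizes assumed): a frame with both a
; large fixed object and a scalable object exercises both pseudos when
; probing is requested via "probe-stack"="inline-asm".
define void @both_pseudos() "probe-stack"="inline-asm" {
entry:
  ; Fixed part: large enough that the allocation is probed in a loop
  ; rather than with a few unrolled probes (probe size is 4096 bytes).
  %fixed = alloca i8, i64 65536, align 8
  ; Scalable part: sized in multiples of vlenb, so it takes the RVV loop.
  %scalable = alloca <vscale x 4 x float>, align 16
  ret void
}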
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
index c6a3649c9ba8f..0052f4b9c041e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
@@ -64,3 +64,49 @@ define <vscale x 1 x i64> @access_fixed_and_vector_objects(ptr %val) {
 
   ret <vscale x 1 x i64> %a
 }
+
+define <vscale x 1 x i64> @probe_fixed_and_vector_objects(ptr %val, <vscale x 1 x i64> %dummy) "probe-stack"="inline-asm" {
+; RV64IV-LABEL: probe_fixed_and_vector_objects:
+; RV64IV:       # %bb.0:
+; RV64IV-NEXT:    addi sp, sp, -528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 528
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    .cfi_def_cfa t1, -8
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB2_1
+; RV64IV-NEXT:  # %bb.2:
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb
+; RV64IV-NEXT:    addi a0, sp, 8
+; RV64IV-NEXT:    vl1re64.v v9, (a0)
+; RV64IV-NEXT:    addi a0, sp, 528
+; RV64IV-NEXT:    vl1re64.v v10, (a0)
+; RV64IV-NEXT:    ld a0, 520(sp)
+; RV64IV-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
+; RV64IV-NEXT:    vadd.vv v8, v9, v10
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 528
+; RV64IV-NEXT:    addi sp, sp, 528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+  %local = alloca i64
+  %vector = alloca <vscale x 1 x i64>
+  %array = alloca [64 x i64]
+  %v1 = load <vscale x 1 x i64>, ptr %array
+  %v2 = load <vscale x 1 x i64>, ptr %vector
+  %len = load i64, ptr %local
+
+  %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %dummy,
+    <vscale x 1 x i64> %v1,
+    <vscale x 1 x i64> %v2,
+    i64 %len)
+
+  ret <vscale x 1 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
new file mode 100644
index 0000000000000..d7f9ae73eaea5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64IV
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32IV
+
+; Tests adapted from AArch64.
+
+; Test prolog sequences for stack probing when vector is involved.
+
+; The space for vector objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size).
+
+define void @f_vector(ptr %out) #0 {
+; RV64IV-LABEL: f_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 1
+; RV64IV-NEXT:    .cfi_def_cfa t1, -16
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB0_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB0_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 1
+; RV32IV-NEXT:    .cfi_def_cfa t1, -16
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB0_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB0_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 4 vectors of stack space.
+define void @f4_vector(ptr %out) #0 {
+; RV64IV-LABEL: f4_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 3
+; RV64IV-NEXT:    .cfi_def_cfa t1, -64
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB1_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB1_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 3
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f4_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 3
+; RV32IV-NEXT:    .cfi_def_cfa t1, -64
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB1_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB1_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 3
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 16 vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+
+define void @f16_vector(ptr %out) #0 {
+; RV64IV-LABEL: f16_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 5
+; RV64IV-NEXT:    .cfi_def_cfa t1, -256
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB2_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB2_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 5
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f16_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 5
+; RV32IV-NEXT:    .cfi_def_cfa t1, -256
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB2_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB2_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 5
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 17 vectors of stack space.
+
+define void @f17_vector(ptr %out) #0 {
+; RV64IV-LABEL: f17_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    li a0, 34
+; RV64IV-NEXT:    mul t1, t1, a0
+; RV64IV-NEXT:    .cfi_def_cfa t1, -272
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB3_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB3_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    li a1, 34
+; RV64IV-NEXT:    mul a0, a0, a1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f17_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    li a0, 34
+; RV32IV-NEXT:    mul t1, t1, a0
+; RV32IV-NEXT:    .cfi_def_cfa t1, -272
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB3_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB3_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    li a1, 34
+; RV32IV-NEXT:    mul a0, a0, a1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  %vec17 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; A vector and a 16-byte fixed size object.
+
+define void @f1_vector_16_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_16_arr:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    addi sp, sp, -16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 16
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 1
+; RV64IV-NEXT:    .cfi_def_cfa t1, -16
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB4_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB4_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 16
+; RV64IV-NEXT:    addi sp, sp, 16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f1_vector_16_arr:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    addi sp, sp, -16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 16
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 1
+; RV32IV-NEXT:    .cfi_def_cfa t1, -16
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB4_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB4_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 16
+; RV32IV-NEXT:    addi sp, sp, 16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 1
+  ret void
+}
+
+; A large vector object and a large slot, both of which need probing.
+
+define void @f1_vector_4096_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_4096_arr:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 4096
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 8192
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 12288
+; RV64IV-NEXT:    addi sp, sp, -16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 12304
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 7
+; RV64IV-NEXT:    .cfi_def_cfa t1, -1024
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB5_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB5_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 7
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 12304
+; RV64IV-NEXT:    lui a0, 3
+; RV64IV-NEXT:    addiw a0, a0, 16
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f1_vector_4096_arr:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 4096
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 8192
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 12288
+; RV32IV-NEXT:    addi sp, sp, -16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 12304
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 7
+; RV32IV-NEXT:    .cfi_def_cfa t1, -1024
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB5_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB5_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 7
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 12304
+; RV32IV-NEXT:    lui a0, 3
+; RV32IV-NEXT:    addi a0, a0, 16
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa_offset 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 256 x float>, align 16
+  %arr = alloca i8, i64 12288, align 1
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

From 5a54950e292e7458964eb7ae27d83d5f6b9cbbbd Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly
Date: Wed, 18 Dec 2024 11:50:11 -0300
Subject: [PATCH 2/3] Add align test and fix types

---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 11 ++-
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    |  6 --
 .../CodeGen/RISCV/stack-clash-prologue.ll     | 68 +++++++++++++++++++
 3 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 04f841d589ce8..504c7936b3288 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -2081,10 +2081,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
-    MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
-    bool IsRVV) const {
+static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                 Register TargetReg, bool IsRVV) {
   assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -2152,8 +2151,6 @@ MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
-
-  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
@@ -2163,7 +2160,7 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
   // to traverse the block while potentially creating more blocks.
   SmallVector<MachineInstr *, 8> ToReplace;
   for (MachineInstr &MI : MBB) {
-    int Opc = MI.getOpcode();
+    unsigned Opc = MI.getOpcode();
     if (Opc == RISCV::PROBED_STACKALLOC ||
         Opc == RISCV::PROBED_STACKALLOC_RVV) {
       ToReplace.push_back(&MI);
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 1a2c6e0302623..26d2a26d681c3 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,12 +83,6 @@ class RISCVFrameLowering : public TargetFrameLowering {
                              uint64_t RealStackSize, bool EmitCFI,
                              bool NeedProbe, uint64_t ProbeSize) const;
 
-  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL, Register TargetReg,
-                                          bool IsRVV) const;
-
 protected:
   const RISCVSubtarget &STI;
 
diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
index 18af080e86747..843e57a42d926 100644
--- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
@@ -538,4 +538,72 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
   ret i32 %c
 }
 
+; alloca < probe_size, align < probe_size, alloca + align > probe_size
+define i32 @f10(i64 %i) local_unnamed_addr #0 {
+; RV64I-LABEL: f10:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 2032
+; RV64I-NEXT:    sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    addi sp, sp, -2048
+; RV64I-NEXT:    addi sp, sp, -1040
+; RV64I-NEXT:    andi sp, sp, -1024
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    addi a1, sp, 1024
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    sw a1, 0(a0)
+; RV64I-NEXT:    lw a0, 1024(sp)
+; RV64I-NEXT:    addi sp, s0, -2032
+; RV64I-NEXT:    .cfi_def_cfa sp, 2032
+; RV64I-NEXT:    ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: f10:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 2032
+; RV32I-NEXT:    sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    addi sp, sp, -2048
+; RV32I-NEXT:    addi sp, sp, -1040
+; RV32I-NEXT:    andi sp, sp, -1024
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    addi a1, sp, 1024
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    lw a0, 1024(sp)
+; RV32I-NEXT:    addi sp, s0, -2032
+; RV32I-NEXT:    .cfi_def_cfa sp, 2032
+; RV32I-NEXT:    lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %a = alloca i32, i32 1000, align 1024
+  %b = getelementptr inbounds i32, ptr %a, i64 %i
+  store volatile i32 1, ptr %b
+  %c = load volatile i32, ptr %a
+  ret i32 %c
+}
+
 attributes #0 = { "probe-stack"="inline-asm" }

From 4fb0ef061fc1bd2fc8e7c33cab77723030f353e6 Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly
Date: Mon, 6 Jan 2025 17:57:15 -0300
Subject: [PATCH 3/3] Add braces as requested

---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 504c7936b3288..ed5bce35bcc0f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -918,15 +918,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (RVVStackSize) {
-    if (NeedProbe)
+    if (NeedProbe) {
       allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
                                   MachineInstr::FrameSetup, !hasFP(MF));
-    else
+    } else {
       // We must keep the stack pointer aligned through any intermediate
       // updates.
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
                     StackOffset::getScalable(-RVVStackSize),
                     MachineInstr::FrameSetup, getStackAlign());
+    }
 
     if (!hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
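For local verification of the series, the expanded loop can be inspected
directly. The file and function names below are placeholders, and the llc
invocation mirrors the RUN lines of stack-probing-rvv.ll; the expected
prologue shape is taken from the autogenerated checks above.

; Assumed invocation: llc -mtriple=riscv64 -mattr=+m,+v -O2 probe.ll -o -
; The prologue should contain the loop synthesized from
; PROBED_STACKALLOC_RVV, as in the f_vector checks:
;   csrr t1, vlenb
;   slli t1, t1, 1          # TargetReg = NumOfVReg (2 here) * vlenb
;   lui  t2, 1              # ScratchReg = probe size (4096)
; .LBB0_1:
;   sub  sp, sp, t2         # allocate one page...
;   sd   zero, 0(sp)        # ...and probe it
;   sub  t1, t1, t2
;   bge  t1, t2, .LBB0_1    # loop while at least a full page remains
;   sub  sp, sp, t1         # allocate the sub-page remainder
define void @probe_rvv() "probe-stack"="inline-asm" {
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  ret void
}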