diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index df6e8d5bbd50e..ab507e3714ebb 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -6027,8 +6027,13 @@ Frame Pointer
 
 If the kernel needs a frame pointer for the reasons defined in
 ``SIFrameLowering`` then SGPR33 is used and is always set to ``0`` in the
-kernel prolog. If a frame pointer is not required then all uses of the frame
-pointer are replaced with immediate ``0`` offsets.
+kernel prolog. On GFX12+, when dynamic VGPRs are enabled, the prologue will
+check if the kernel is running on a compute queue, and if so it will reserve
+some scratch space for any dynamic VGPRs that might need to be saved by the
+CWSR trap handler. In this case, the frame pointer will be initialized to
+a suitably aligned offset above this reserved area. If a frame pointer is not
+required then all uses of the frame pointer are replaced with immediate ``0``
+offsets.
 
 .. _amdgpu-amdhsa-kernel-prolog-flat-scratch:
 
@@ -17140,33 +17145,35 @@ within a map that has been added by the same *vendor-name*.
   .. table:: AMDPAL Code Object Hardware Stage Metadata Map
      :name: amdgpu-amdpal-code-object-hardware-stage-metadata-map-table
 
-     ========================== ============== ========= ===============================================================
-     String Key                 Value Type     Required? Description
-     ========================== ============== ========= ===============================================================
-     ".entry_point"             string                   The ELF symbol pointing to this pipeline's stage entry point.
-     ".scratch_memory_size"     integer                  Scratch memory size in bytes.
-     ".lds_size"                integer                  Local Data Share size in bytes.
-     ".perf_data_buffer_size"   integer                  Performance data buffer size in bytes.
-     ".vgpr_count"              integer                  Number of VGPRs used.
-     ".agpr_count"              integer                  Number of AGPRs used.
-     ".sgpr_count"              integer                  Number of SGPRs used.
-     ".vgpr_limit"              integer                  If non-zero, indicates the shader was compiled with a
-                                                         directive to instruct the compiler to limit the VGPR usage to
-                                                         be less than or equal to the specified value (only set if
-                                                         different from HW default).
-     ".sgpr_limit"              integer                  SGPR count upper limit (only set if different from HW
-                                                         default).
-     ".threadgroup_dimensions"  sequence of              Thread-group X/Y/Z dimensions (Compute only).
-                                3 integers
-     ".wavefront_size"          integer                  Wavefront size (only set if different from HW default).
-     ".uses_uavs"               boolean                  The shader reads or writes UAVs.
-     ".uses_rovs"               boolean                  The shader reads or writes ROVs.
-     ".writes_uavs"             boolean                  The shader writes to one or more UAVs.
-     ".writes_depth"            boolean                  The shader writes out a depth value.
-     ".uses_append_consume"     boolean                  The shader uses append and/or consume operations, either
-                                                         memory or GDS.
-     ".uses_prim_id"            boolean                  The shader uses PrimID.
-     ========================== ============== ========= ===============================================================
+     =========================== ============== ========= ===============================================================
+     String Key                  Value Type     Required? Description
+     =========================== ============== ========= ===============================================================
+     ".entry_point"              string                   The ELF symbol pointing to this pipeline's stage entry point.
+     ".scratch_memory_size"      integer                  Scratch memory size in bytes.
+     ".lds_size"                 integer                  Local Data Share size in bytes.
+     ".perf_data_buffer_size"    integer                  Performance data buffer size in bytes.
+     ".vgpr_count"               integer                  Number of VGPRs used.
+     ".agpr_count"               integer                  Number of AGPRs used.
+     ".sgpr_count"               integer                  Number of SGPRs used.
+     ".dynamic_vgpr_saved_count" integer        No        Number of dynamic VGPRs that can be stored in scratch by the
+                                                          CWSR trap handler. Only used on GFX12+.
+     ".vgpr_limit"               integer                  If non-zero, indicates the shader was compiled with a
+                                                          directive to instruct the compiler to limit the VGPR usage to
+                                                          be less than or equal to the specified value (only set if
+                                                          different from HW default).
+     ".sgpr_limit"               integer                  SGPR count upper limit (only set if different from HW
+                                                          default).
+     ".threadgroup_dimensions"   sequence of              Thread-group X/Y/Z dimensions (Compute only).
+                                 3 integers
+     ".wavefront_size"           integer                  Wavefront size (only set if different from HW default).
+     ".uses_uavs"                boolean                  The shader reads or writes UAVs.
+     ".uses_rovs"                boolean                  The shader reads or writes ROVs.
+     ".writes_uavs"              boolean                  The shader writes to one or more UAVs.
+     ".writes_depth"             boolean                  The shader writes out a depth value.
+     ".uses_append_consume"      boolean                  The shader uses append and/or consume operations, either
+                                                          memory or GDS.
+     ".uses_prim_id"             boolean                  The shader uses PrimID.
+     =========================== ============== ========= ===============================================================
 
 ..
 
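(For reference: the queue check described in the documentation above lowers to a three-instruction SALU sequence at the top of the kernel prologue. The sketch below matches the CHECK lines of the dynamic-vgpr-reserve-stack-for-cwsr.ll test added later in this patch; the 0x1c0 literal is the reservation size computed for gfx1200 and is target- and alignment-dependent.

    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) ; read the 2-bit ME_ID field
    s_cmp_lg_u32 0, s33                          ; SCC = (ME_ID != 0), i.e. on a compute queue
    s_cmovk_i32 s33, 0x1c0                       ; compute queue: FP starts above the reserved area

On the graphics queue ME_ID is 0, so the conditional move does nothing and the frame pointer remains 0, matching the previous behavior.)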
+ ".agpr_count" integer Number of AGPRs used. + ".sgpr_count" integer Number of SGPRs used. + ".dynamic_vgpr_saved_count" integer No Number of dynamic VGPRs that can be stored in scratch by the + CWSR trap handler. Only used on GFX12+. + ".vgpr_limit" integer If non-zero, indicates the shader was compiled with a + directive to instruct the compiler to limit the VGPR usage to + be less than or equal to the specified value (only set if + different from HW default). + ".sgpr_limit" integer SGPR count upper limit (only set if different from HW + default). + ".threadgroup_dimensions" sequence of Thread-group X/Y/Z dimensions (Compute only). + 3 integers + ".wavefront_size" integer Wavefront size (only set if different from HW default). + ".uses_uavs" boolean The shader reads or writes UAVs. + ".uses_rovs" boolean The shader reads or writes ROVs. + ".writes_uavs" boolean The shader writes to one or more UAVs. + ".writes_depth" boolean The shader writes out a depth value. + ".uses_append_consume" boolean The shader uses append and/or consume operations, either + memory or GDS. + ".uses_prim_id" boolean The shader uses PrimID. + =========================== ============== ========= =============================================================== .. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index beffebaaa66d4..800e2b9c0e657 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1440,8 +1440,15 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setEntryPoint(CC, MF.getFunction().getName()); MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); - // Only set AGPRs for supported devices + // For targets that support dynamic VGPRs, set the number of saved dynamic + // VGPRs (if any) in the PAL metadata. 
const GCNSubtarget &STM = MF.getSubtarget(); + if (STM.isDynamicVGPREnabled() && + MFI->getScratchReservedForDynamicVGPRs() > 0) + MD->setHwStage(CC, ".dynamic_vgpr_saved_count", + MFI->getScratchReservedForDynamicVGPRs() / 4); + + // Only set AGPRs for supported devices if (STM.hasMAIInsts()) { MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 721601efcc804..8e811b43a4532 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -552,6 +552,7 @@ enum Id { // HwRegCode, (6) [5:0] enum Offset : unsigned { // Offset, (5) [10:6] OFFSET_MEM_VIOL = 8, + OFFSET_ME_ID = 8, // in HW_ID2 }; enum ModeRegisterMasks : uint32_t { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 97736e2410c18..9c737b4f3e378 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -691,17 +691,62 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); - if (hasFP(MF)) { + unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); + if (!mayReserveScratchForCWSR(MF)) { + if (hasFP(MF)) { + Register FPReg = MFI->getFrameOffsetReg(); + assert(FPReg != AMDGPU::FP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); + } + + if (requiresStackPointerReference(MF)) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset); + } + } else { + // We need to check if we're on a compute queue - if we are, then the CWSR + // trap handler may need to store some VGPRs on the stack. The first VGPR + // block is saved separately, so we only need to allocate space for any + // additional VGPR blocks used. For now, we will make sure there's enough + // room for the theoretical maximum number of VGPRs that can be allocated. + // FIXME: Figure out if the shader uses fewer VGPRs in practice. + assert(hasFP(MF)); Register FPReg = MFI->getFrameOffsetReg(); assert(FPReg != AMDGPU::FP_REG); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); - } - - if (requiresStackPointerReference(MF)) { - Register SPReg = MFI->getStackPtrOffsetReg(); - assert(SPReg != AMDGPU::SP_REG); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) - .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); + unsigned VGPRSize = + llvm::alignTo((ST.getAddressableNumVGPRs() - + AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) * + 4, + FrameInfo.getMaxAlign()); + MFI->setScratchReservedForDynamicVGPRs(VGPRSize); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg) + .addImm(AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2)); + // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute + // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set + // SCC, so we need to check for 0 manually. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize); + if (requiresStackPointerReference(MF)) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + + // If at least one of the constants can be inlined, then we can use + // s_cselect. Otherwise, use a mov and cmovk. 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 938c75099a3bc..9dac4bc8951e5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -86,6 +86,10 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
 
 public:
   bool requiresStackPointerReference(const MachineFunction &MF) const;
+
+  // Returns true if the function may need to reserve space on the stack for the
+  // CWSR trap handler.
+  bool mayReserveScratchForCWSR(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 93b030b0e0a70..efdf642e29db3 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -715,7 +715,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
       ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
       PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
       MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
-      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) {
+      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
   for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
     SpillPhysVGPRS.push_back(regToString(Reg, TRI));
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 740f752bc93b7..a60409b5a7e09 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -299,6 +299,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
 
   bool HasInitWholeWave = false;
 
+  unsigned ScratchReservedForDynamicVGPRs = 0;
+
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -350,6 +352,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
     YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
+    YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
+                       MFI.ScratchReservedForDynamicVGPRs, 0);
   }
 };
 
@@ -455,6 +459,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned NumSpilledSGPRs = 0;
   unsigned NumSpilledVGPRs = 0;
 
+  // The size in bytes of the scratch space reserved for the CWSR trap handler
+  // to spill some of the dynamic VGPRs.
+  unsigned ScratchReservedForDynamicVGPRs = 0;
+
   // Tracks information about user SGPRs that will be setup by hardware which
   // will apply to all wavefronts of the grid.
   GCNUserSGPRUsageInfo UserSGPRInfo;
@@ -780,6 +788,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     BytesInStackArgArea = Bytes;
   }
 
+  // This is only used if we need to save any dynamic VGPRs in scratch.
+  unsigned getScratchReservedForDynamicVGPRs() const {
+    return ScratchReservedForDynamicVGPRs;
+  }
+
+  void setScratchReservedForDynamicVGPRs(unsigned SizeInBytes) {
+    ScratchReservedForDynamicVGPRs = SizeInBytes;
+  }
+
   // Add user SGPRs.
   Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
   Register addDispatchPtr(const SIRegisterInfo &TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 39df8bb3a9fc8..c1ac9491b2363 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -511,6 +511,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const SIFrameLowering *TFI = ST.getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
   // During ISel lowering we always reserve the stack pointer in entry and chain
   // functions, but never actually want to reference it when accessing our own
   // frame. If we need a frame pointer we use it, but otherwise we can just use
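(With the new field serialized, MIR printed after prologue/epilogue insertion shows the reservation alongside the other frame-related machineFunctionInfo fields. For a gfx1200 compute entry point with dynamic VGPRs enabled, the relevant slice looks like the following, cf. the machine-function-info-cwsr.ll expectations below; surrounding fields are elided:

    machineFunctionInfo:
      ...
      hasInitWholeWave: false
      scratchReservedForDynamicVGPRs: 448

The field defaults to 0, which is what the pre-existing MIR tests updated at the end of this patch now check.)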
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
new file mode 100644
index 0000000000000..ca2fca69dcf21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -0,0 +1,251 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: amdgpu_cs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: kernel:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  ret void
+}
+
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: with_local:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 13, ptr addrspace(5) %local
+  ret void
+}
+
+; Check that we generate s_cselect for SP if we can fit
+; the offset in an inline constant.
+define amdgpu_cs void @with_calls_inline_const() #0 {
+; CHECK-LABEL: with_calls_inline_const:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    s_cselect_b32 s32, 0x1d0, 16
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+; Check that we generate s_mov + s_cmovk if we can't
+; fit the offset for SP in an inline constant.
+define amdgpu_cs void @with_calls_no_inline_const() #0 {
+; CHECK-LABEL: with_calls_no_inline_const:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    s_movk_i32 s32, 0x100
+; CHECK-NEXT:    s_cmovk_i32 s32, 0x2c0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, i32 61, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_cs void @with_spills() {
+; CHECK-LABEL: with_spills:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  call void asm "; spills", "~{v40},~{v42}"()
+  ret void
+}
+
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: realign_stack:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x200
+; CHECK-NEXT:    s_movk_i32 s32, 0x100
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:112
+; CHECK-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:96
+; CHECK-NEXT:    scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:64
+; CHECK-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT:    scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT:    scratch_store_b128 off, v[4:7], s33 offset:16
+; CHECK-NEXT:    scratch_store_b128 off, v[0:3], s33
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    s_cmovk_i32 s32, 0x300
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %v = alloca <32 x i32>, align 128, addrspace(5)
+  store <32 x i32> %x, ptr addrspace(5) %v
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_cs void @frame_pointer_none() #1 {
+; CHECK-LABEL: frame_pointer_none:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 13, ptr addrspace(5) %local
+  ret void
+}
+
+define amdgpu_cs void @frame_pointer_all() #2 {
+; CHECK-LABEL: frame_pointer_all:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 13, ptr addrspace(5) %local
+  ret void
+}
+
+; Non-entry functions and graphics shaders don't need to worry about CWSR.
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: amdgpu_gs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    s_mov_b32 s32, 16
+; CHECK-NEXT:    scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    s_alloc_vgpr 0
+; CHECK-NEXT:    s_endpgm
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: amdgpu_gfx:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_mov_b32 s0, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-NEXT:    v_writelane_b32 v40, s0, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT:    s_add_co_i32 s32, s32, 16
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT:    s_wait_storecnt 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-NEXT:    s_mov_b32 s33, s0
+; CHECK-NEXT:    s_wait_loadcnt 0x0
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: default:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "frame-pointer"="none" }
+attributes #2 = { nounwind "frame-pointer"="all" }
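(The two with_calls tests above also pin down the SP arithmetic. With an inlinable base offset of 16, a single conditional select suffices, while the 0x100 offset in the second test needs the mov + cmovk fallback; in both cases the compute-queue value is the base offset plus the 0x1c0 reservation:

    s_cselect_b32 s32, 0x1d0, 16   ; 0x1d0 = 16 + 0x1c0
    s_movk_i32 s32, 0x100          ; graphics-queue value
    s_cmovk_i32 s32, 0x2c0         ; 0x2c0 = 0x100 + 0x1c0
)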
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
new file mode 100644
index 0000000000000..2de6699aab665
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_cs
+; CHECK: scratchReservedForDynamicVGPRs: 448
+  ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: {{^}}name: kernel
+; CHECK: scratchReservedForDynamicVGPRs: 448
+  ret void
+}
+
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: {{^}}name: with_local
+; CHECK: scratchReservedForDynamicVGPRs: 448
+  %local = alloca i32, addrspace(5)
+  store volatile i8 13, ptr addrspace(5) %local
+  ret void
+}
+
+define amdgpu_cs void @with_calls() #0 {
+; CHECK-LABEL: {{^}}name: with_calls
+; CHECK: scratchReservedForDynamicVGPRs: 448
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: {{^}}name: realign_stack
+; CHECK: scratchReservedForDynamicVGPRs: 512
+  %v = alloca <32 x i32>, align 128, addrspace(5)
+  store <32 x i32> %x, ptr addrspace(5) %v
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+; Non-entry functions and graphics shaders can't run on a compute queue,
+; so they don't need to worry about CWSR.
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_gs
+; CHECK: scratchReservedForDynamicVGPRs: 0
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: {{^}}name: amdgpu_gfx
+; CHECK: scratchReservedForDynamicVGPRs: 0
+  %local = alloca i32, addrspace(5)
+  store volatile i8 15, ptr addrspace(5) %local
+  call amdgpu_gfx void @callee(i32 71)
+  ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: {{^}}name: default
+; CHECK: scratchReservedForDynamicVGPRs: 0
+  ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index fa22089978c2e..5748f6b188acf 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,9 +1,10 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11,NODVGPR
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK,NODVGPR
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR
 
 ; CHECK-LABEL: {{^}}_amdgpu_cs_main:
-; CHECK: ; TotalNumSgprs: 4
+; NODVGPR: ; TotalNumSgprs: 4
+; DVGPR: ; TotalNumSgprs: 34
 ; CHECK: ; NumVgprs: 2
 ; CHECK: .amdgpu_pal_metadata
 ; CHECK-NEXT: ---
@@ -56,6 +57,7 @@
 ; CHECK-NEXT:        .cs:
 ; CHECK-NEXT:          .checksum_value: 0x9444d7d0
 ; CHECK-NEXT:          .debug_mode: false
+; DVGPR-NEXT:          .dynamic_vgpr_saved_count: 0x70
 ; CHECK-NEXT:          .entry_point: _amdgpu_cs
 ; CHECK-NEXT:          .entry_point_symbol: _amdgpu_cs_main
 ; CHECK-NEXT:          .excp_en: 0
@@ -66,7 +68,8 @@
 ; CHECK-NEXT:          .mem_ordered: true
 ; CHECK-NEXT:          .scratch_en: false
 ; CHECK-NEXT:          .scratch_memory_size: 0
-; CHECK-NEXT:          .sgpr_count: 0x4
+; NODVGPR-NEXT:        .sgpr_count: 0x4
+; DVGPR-NEXT:          .sgpr_count: 0x22
 ; CHECK-NEXT:          .sgpr_limit: 0x6a
 ; CHECK-NEXT:          .threadgroup_dimensions:
 ; CHECK-NEXT:            - 0x1
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index eb4ee118ec2e4..2bb31e926e39a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -44,6 +44,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
 entry:
@@ -311,6 +312,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
 entry:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 6f5467b00ebcc..a712cb5f7f3e3 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -44,6 +44,7 @@
 ; AFTER-PEI-NEXT: sgprForEXECCopy: ''
 ; AFTER-PEI-NEXT: longBranchReservedReg: ''
 ; AFTER-PEI-NEXT: hasInitWholeWave: false
+; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
 ; AFTER-PEI-NEXT: body:
 define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index b2f299d531f5c..e99a06f497016 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -44,6 +44,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
 bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 93f2c343cd051..076d7c9cd8842 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -44,6 +44,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
 bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 89d831b51f694..944b2aa4dc175 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -53,6 +53,7 @@
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -158,6 +159,7 @@ body: |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -234,6 +236,7 @@ body: |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -311,6 +314,7 @@ body: |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
+# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index ec56de11b250a..dfe3e33e8b3ec 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -54,6 +54,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
   %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -101,6 +102,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
   %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -172,6 +174,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define void @function() {
   ret void
@@ -225,6 +228,7 @@ define void @function() {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
+; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define void @function_nsz() #0 {
   ret void