diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 9798e5437be5e..174a497c51b26 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -4263,10 +4263,9 @@ same *vendor-name*. wavefront for GFX6-GFX9. A register is required if it is - used explicitly, or + written to, or if a higher numbered - register is used - explicitly. This + register is written to. This includes the special SGPRs for VCC, Flat Scratch (GFX7-GFX9) @@ -4284,10 +4283,10 @@ same *vendor-name*. each work-item for GFX6-GFX9. A register is required if it is - used explicitly, or + written to, or if a higher numbered - register is used - explicitly. + register is + written to. ".agpr_count" integer Required Number of accumulator registers required by each work-item for diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 491314daf2d81..d4fea30f21f45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // dispatch registers are function args. unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - if (isShader(F.getCallingConv())) { + if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) { bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); @@ -1060,15 +1060,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. - const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 9a609a1752de0..7bde59412d905 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -137,274 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( if (MFI->isStackRealigned()) Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); - Info.UsesVCC = - MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); - - // If there are no calls, MachineRegisterInfo can tell us the used register - // count easily. - // A tail call isn't considered a call for MachineFrameInfo's purposes. - if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); - return Info; + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC); + + Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); + Info.NumExplicitSGPR = + TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + + // Preloaded registers are written by the hardware, not defined in the + // function body, so they need special handling. + if (MFI->isEntryFunction()) { + Info.NumExplicitSGPR = + std::max(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs()); + Info.NumVGPR = std::max(Info.NumVGPR, MFI->getNumPreloadedVGPRs()); } - int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; + if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) + return Info; + Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? - for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; - - if (!MO.isReg()) - continue; - - Register Reg = MO.getReg(); - switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - - case AMDGPU::NoRegister: - assert(MI.isDebugInstr() && - "Instruction uses invalid noreg register"); - continue; - - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - - case AMDGPU::XNACK_MASK: - case AMDGPU::XNACK_MASK_LO: - case AMDGPU::XNACK_MASK_HI: - llvm_unreachable("xnack_mask registers should not be used"); - - case AMDGPU::LDS_DIRECT: - llvm_unreachable("lds_direct register should not be used"); - - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - case AMDGPU::SRC_VCCZ: - llvm_unreachable("src_vccz register should not be used"); - - case AMDGPU::SRC_EXECZ: - llvm_unreachable("src_execz register should not be used"); - - case AMDGPU::SRC_SCC: - llvm_unreachable("src_scc register should not be used"); - - default: - break; - } - - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. - assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } - unsigned HWReg = TRI.getHWRegIndex(Reg); - int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; - } - } - if (MI.isCall()) { // Pseudo used just to encode the underlying global. Is there a better // way to track this? @@ -464,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; - Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; - return Info; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 0e7635a045588..01718faaf5c2e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -970,10 +970,25 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return NumUserSGPRs; } + // Get the number of preloaded SGPRs for compute kernels. unsigned getNumPreloadedSGPRs() const { return NumUserSGPRs + NumSystemSGPRs; } + // Get the number of preloaded VGPRs for compute kernels. + unsigned getNumPreloadedVGPRs() const { + if (hasWorkItemIDZ()) + return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1; + + if (hasWorkItemIDY()) + return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1; + + if (hasWorkItemIDX()) + return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1; + + return 0; + } + unsigned getNumKernargPreloadedSGPRs() const { return UserSGPRInfo.getNumKernargPreloadSGPRs(); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index e41189adfb46f..511ea4125c8ec 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4055,6 +4055,20 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, return 0; } +unsigned +SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC) const { + for (MCPhysReg Reg : reverse(RC.getRegisters())) { + for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) { + if (std::any_of( + MRI.def_instr_begin(*AI), MRI.def_instr_end(), + [](const MachineInstr &MI) { return !MI.isImplicitDef(); })) + return getHWRegIndex(Reg) + 1; + } + } + return 0; +} + SmallVector SIRegisterInfo::getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index a4b135d5e0b59..7726762ad0e6d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,6 +486,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC) const; + // \returns the number of registers of a given \p RC defined in a function. + // Does not go inside function calls. + unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC) const; + std::optional getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG : std::optional{}; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 9b35920f8547a..bdd86c1af6248 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 17 +; GPRIDX-NEXT: wavefront_sgpr_count = 24 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 2 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 10 +; GFX10-NEXT: wavefront_sgpr_count = 18 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: kernel_code_entry_byte_offset = 256 ; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX11-NEXT: granulated_workitem_vgpr_count = 0 -; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX11-NEXT: priority = 0 ; GFX11-NEXT: float_mode = 240 ; GFX11-NEXT: priv = 0 @@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: gds_segment_byte_size = 0 ; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 7 +; GFX11-NEXT: wavefront_sgpr_count = 16 ; GFX11-NEXT: workitem_vgpr_count = 3 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 16 -; GPRIDX-NEXT: workitem_vgpr_count = 2 +; GPRIDX-NEXT: wavefront_sgpr_count = 24 +; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 ; GPRIDX-NEXT: reserved_sgpr_first = 0 @@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 2 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 10 -; GFX10-NEXT: workitem_vgpr_count = 2 +; GFX10-NEXT: wavefront_sgpr_count = 18 +; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 ; GFX10-NEXT: reserved_sgpr_first = 0 @@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: kernel_code_entry_byte_offset = 256 ; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX11-NEXT: granulated_workitem_vgpr_count = 0 -; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX11-NEXT: priority = 0 ; GFX11-NEXT: float_mode = 240 ; GFX11-NEXT: priv = 0 @@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: gds_segment_byte_size = 0 ; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 6 +; GFX11-NEXT: wavefront_sgpr_count = 16 ; GFX11-NEXT: workitem_vgpr_count = 2 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 16 +; GPRIDX-NEXT: wavefront_sgpr_count = 24 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 2 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: gds_segment_byte_size = 0 ; GFX10-NEXT: kernarg_segment_byte_size = 28 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 10 +; GFX10-NEXT: wavefront_sgpr_count = 18 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: kernel_code_entry_byte_offset = 256 ; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX11-NEXT: granulated_workitem_vgpr_count = 0 -; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 1 ; GFX11-NEXT: priority = 0 ; GFX11-NEXT: float_mode = 240 ; GFX11-NEXT: priv = 0 @@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: gds_segment_byte_size = 0 ; GFX11-NEXT: kernarg_segment_byte_size = 28 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 7 +; GFX11-NEXT: wavefront_sgpr_count = 16 ; GFX11-NEXT: workitem_vgpr_count = 3 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll index 7bf9a29e9ff44..cc614bb24839c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll @@ -13,8 +13,9 @@ ; CHECK: {{^}}kernel_illegal_agpr_use_asm: ; CHECK: ; use a0 -; CHECK: NumVgprs: 0 -; CHECK: NumAgprs: 1 +; GFX908: NumVgprs: 3 +; GFX90A: NumVgprs: 1 +; CHECK: NumAgprs: 0 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 { call void asm sideeffect "; use $0", "a"(i32 poison) ret void @@ -24,7 +25,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 { ; CHECK: ; use a0 ; CHECK: NumVgprs: 0 -; CHECK: NumAgprs: 1 +; CHECK: NumAgprs: 0 define void @func_illegal_agpr_use_asm() #0 { call void asm sideeffect "; use $0", "a"(i32 poison) ret void diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index dd760c2a215ca..7851de641c5a3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -10,9 +10,9 @@ ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: ; ASM: .amdhsa_user_sgpr_count 12 -; ASM: .amdhsa_next_free_sgpr 12 -; ASM: ; TotalNumSgprs: 18 -; ASM: ; NumSGPRsForWavesPerEU: 18 +; ASM: .amdhsa_next_free_sgpr 15 +; ASM: ; TotalNumSgprs: 21 +; ASM: ; NumSGPRsForWavesPerEU: 21 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT ; feild that are not explicitly referenced in the kernel. This test has 6 implicit @@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ ....... ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................ -; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @............... +; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000 ................ ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2: ; ASM: .amdhsa_user_sgpr_count 10 -; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; TotalNumSgprs: 16 -; ASM: ; NumSGPRsForWavesPerEU: 16 +; ASM: .amdhsa_next_free_sgpr 11 +; ASM: ; TotalNumSgprs: 17 +; ASM: ; NumSGPRsForWavesPerEU: 17 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2 ; implicit, and 6 extra. @@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2: ; ASM: .amdhsa_user_sgpr_count 3 -; ASM: .amdhsa_next_free_sgpr 3 -; ASM: ; TotalNumSgprs: 9 -; ASM: ; NumSGPRsForWavesPerEU: 9 +; ASM: .amdhsa_next_free_sgpr 4 +; ASM: ; TotalNumSgprs: 10 +; ASM: ; NumSGPRsForWavesPerEU: 10 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD. @@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................ ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................ -; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................ +; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000 @............... ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2: ; ASM: .amdhsa_user_sgpr_count 2 -; ASM: .amdhsa_next_free_sgpr 0 -; ASM: ; TotalNumSgprs: 6 -; ASM: ; NumSGPRsForWavesPerEU: 6 +; ASM: .amdhsa_next_free_sgpr 3 +; ASM: ; TotalNumSgprs: 9 +; ASM: ; NumSGPRsForWavesPerEU: 9 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD. ; Encoded like '00'. diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index f4d17e50cf18c..494ade73cb5f8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -142,8 +142,8 @@ attributes #0 = { nounwind } ; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: -; SDAG-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} -; GISEL-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}} +; GFX8-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}} +; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}} ; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: @@ -164,13 +164,13 @@ attributes #0 = { nounwind } ; GCN-NEXT: multiple_stack: ; GCN-NEXT: .backend_stack_size: 0x24{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x21{{$}} +; GCN-NEXT: .sgpr_count: 0x1{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x24{{$}} ; GCN-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: no_stack: ; GCN-NEXT: .backend_stack_size: 0{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x20{{$}} +; GCN-NEXT: .sgpr_count: 0x1{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: no_stack_call: @@ -203,7 +203,7 @@ attributes #0 = { nounwind } ; GCN-NEXT: simple_lds: ; GCN-NEXT: .backend_stack_size: 0{{$}} ; GCN-NEXT: .lds_size: 0x100{{$}} -; GCN-NEXT: .sgpr_count: 0x20{{$}} +; GCN-NEXT: .sgpr_count: 0x1{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: simple_lds_recurse: @@ -215,7 +215,7 @@ attributes #0 = { nounwind } ; GCN-NEXT: simple_stack: ; GCN-NEXT: .backend_stack_size: 0x14{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x21{{$}} +; GCN-NEXT: .sgpr_count: 0x1{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x14{{$}} ; GCN-NEXT: .vgpr_count: 0x2{{$}} ; GCN-NEXT: simple_stack_call: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index f52ba7000edeb..5ccf41c408b72 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s -; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s -; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s +; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s ; ELF: Section { ; ELF: Name: .text @@ -23,8 +23,16 @@ ; ELF: Section: .text (0x2) ; ELF: } -; GFX10: NumSGPRsForWavesPerEU: 6 -; GFX10: NumVGPRsForWavesPerEU: 1 +; GFX10: NumSGPRsForWavesPerEU: 12 +; GFX10: NumVGPRsForWavesPerEU: 3 + +; Wave32 and 64 behave differently due to the UserSGPRInit16Bug, +; which only affects Wave32. +; GFX11W32: NumSGPRsForWavesPerEU: 16 +; GFX11W32: NumVGPRsForWavesPerEU: 1 + +; GFX11W64: NumSGPRsForWavesPerEU: 11 +; GFX11W64: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(ptr addrspace(1) %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 616867481d177..0e0a81d4657df 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s ; CHECK-LABEL: {{^}}min_64_max_64: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @min_64_max_64() #0 { entry: ret void @@ -13,10 +13,10 @@ entry: attributes #0 = {"amdgpu-flat-work-group-size"="64,64"} ; CHECK-LABEL: {{^}}min_64_max_128: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @min_64_max_128() #1 { entry: ret void diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index e9fe4f3c618c7..5617a80fc94b4 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"} ; Exactly 10 waves per execution unit. ; CHECK-LABEL: {{^}}empty_exactly_10: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_exactly_10() #2 { entry: ret void @@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"} ; At least 1 wave per execution unit. ; CHECK-LABEL: {{^}}empty_at_least_1: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_at_least_1() #3 { entry: ret void @@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"} ; At least 5 waves per execution unit. ; CHECK-LABEL: {{^}}empty_at_least_5: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_at_least_5() #4 { entry: ret void @@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"} ; At least 10 waves per execution unit. ; CHECK-LABEL: {{^}}empty_at_least_10: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_at_least_10() #5 { entry: ret void @@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64 ; At most 10 waves per execution unit. ; CHECK-LABEL: {{^}}empty_at_most_10: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_at_most_10() #7 { entry: ret void @@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"} ; Between 5 and 10 waves per execution unit. ; CHECK-LABEL: {{^}}empty_between_5_and_10: -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 -; CHECK: NumVGPRsForWavesPerEU: 1 +; CHECK: NumVGPRsForWavesPerEU: 3 define amdgpu_kernel void @empty_between_5_and_10() #8 { entry: ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index 2e79d8bab46a6..efa416e301ccc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -28,7 +28,7 @@ bb: } ; ALL: .set .Laliasee_default.num_vgpr, 0 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27 -; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32 +; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0 attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll index 337da5d0ecbe0..62ca985bc6400 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -18,7 +18,7 @@ bb: ; CHECK: .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0 -; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32 +; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0 define internal void @aliasee_default_vgpr64_sgpr102() #1 { bb: call void asm sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index 075eddd2763d3..344f8200608f6 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -24,7 +24,7 @@ bb: ; CHECK: .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0 -; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32 +; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0 define internal void @aliasee_vgpr32_sgpr76() #1 { bb: call void asm sideeffect "; clobber v26 ", "~{v26}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index 4fd181d3c0f43..3d36f8a514c47 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -21,7 +21,7 @@ bb: ; CHECK: .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0 -; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32 +; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0 define internal void @aliasee_vgpr64_sgpr102() #1 { bb: call void asm sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index 00f72d5d8b1dd..2274c437daf62 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -21,7 +21,7 @@ bb: ; CHECK: .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0 -; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33 +; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0 define internal void @aliasee_vgpr256_sgpr102() #1 { bb: call void asm sideeffect "; clobber v252 ", "~{v252}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index dbd00f09943c0..db1269e8e95c2 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -7,7 +7,7 @@ ; Make sure to run a GPU with the SGPR allocation bug. ; GCN-LABEL: {{^}}use_vcc: -; GCN: ; TotalNumSgprs: 34 +; GCN: ; TotalNumSgprs: 2 ; GCN: ; NumVgprs: 0 define void @use_vcc() #1 { call void asm sideeffect "", "~{vcc}" () #0 @@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) } ; GCN-LABEL: {{^}}use_flat_scratch: -; CI: ; TotalNumSgprs: 36 -; VI: ; TotalNumSgprs: 38 +; CI: ; TotalNumSgprs: 4 +; VI: ; TotalNumSgprs: 6 ; GCN: ; NumVgprs: 0 define void @use_flat_scratch() #1 { call void asm sideeffect "", "~{flat_scratch}" () #0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { ; Make sure there's no assert when a sgpr96 is used. ; GCN-LABEL: {{^}}count_use_sgpr96_external_call ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}] -; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr) ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4 ; VI-BUG: TotalNumSgprs: 96 @@ -249,7 +249,7 @@ entry: ; Make sure there's no assert when a sgpr160 is used. ; GCN-LABEL: {{^}}count_use_sgpr160_external_call ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}] -; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr) ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4 ; VI-BUG: TotalNumSgprs: 96 diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll index 61830f18ad7a7..55dc394628176 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 8 +; CHECK: ; NumVgprs: 5 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 3fe3cafd729a7..d8d7494d0dc1c 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -16,7 +16,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -35,7 +35,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 16 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -93,7 +93,7 @@ entry: ; registers used. ; ; ALL-ASM-LABEL: {{^}}empty: -; ALL-ASM: .amdhsa_next_free_vgpr 1 +; ALL-ASM: .amdhsa_next_free_vgpr 3 ; ALL-ASM: .amdhsa_next_free_sgpr 1 define amdgpu_kernel void @empty( i32 %i, diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index 22d699a8f4809..59cf9825116fa 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -43,7 +43,7 @@ ; OSABI-HSA-ELF: .sgpr_count: 96 ; OSABI-HSA-ELF: .sgpr_spill_count: 0 ; OSABI-HSA-ELF: .symbol: elf_notes.kd -; OSABI-HSA-ELF: .vgpr_count: 0 +; OSABI-HSA-ELF: .vgpr_count: 1 ; OSABI-HSA-ELF: .vgpr_spill_count: 0 ; OSABI-HSA-ELF: .wavefront_size: 64 ; OSABI-HSA-ELF: amdhsa.target: amdgcn-amd-amdhsa--gfx802 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index a59382ba20dc5..ed1f3e1397abc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -27,15 +27,15 @@ ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 8 -; VI-NOXNACK: ; TotalNumSgprs: 8 -; HSA-VI-NOXNACK: ; TotalNumSgprs: 8 -; VI-XNACK: ; TotalNumSgprs: 12 -; HSA-VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 12 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 18 +; VI-XNACK: ; TotalNumSgprs: 16 +; HSA-VI-XNACK: ; TotalNumSgprs: 22 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -50,15 +50,15 @@ entry: ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 10 -; VI-NOXNACK: ; TotalNumSgprs: 10 -; HSA-VI-NOXNACK: ; TotalNumSgprs: 10 -; VI-XNACK: ; TotalNumSgprs: 12 -; HSA-VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 +; CI: ; TotalNumSgprs: 14 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 20 +; VI-XNACK: ; TotalNumSgprs: 16 +; HSA-VI-XNACK: ; TotalNumSgprs: 22 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -73,15 +73,15 @@ entry: ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 12 -; VI-NOXNACK: ; TotalNumSgprs: 14 +; CI: ; TotalNumSgprs: 16 +; VI-NOXNACK: ; TotalNumSgprs: 18 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 -; VI-XNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 18 ; HSA-VI-XNACK: ; TotalNumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -96,15 +96,15 @@ entry: ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; TotalNumSgprs: 12 -; VI-NOXNACK: ; TotalNumSgprs: 14 +; CI: ; TotalNumSgprs: 16 +; VI-NOXNACK: ; TotalNumSgprs: 18 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 -; VI-XNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 18 ; HSA-VI-XNACK: ; TotalNumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -122,15 +122,15 @@ entry: ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 +; CI: NumSgprs: 16 +; VI-NOXNACK: NumSgprs: 18 ; HSA-VI-NOXNACK: NumSgprs: 24 -; VI-XNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 18 ; HSA-VI-XNACK: NumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -143,15 +143,15 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 +; CI: NumSgprs: 16 +; VI-NOXNACK: NumSgprs: 18 ; HSA-VI-NOXNACK: NumSgprs: 24 -; VI-XNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 18 ; HSA-VI-XNACK: NumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -166,15 +166,15 @@ entry: ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 +; CI: NumSgprs: 16 +; VI-NOXNACK: NumSgprs: 18 ; HSA-VI-NOXNACK: NumSgprs: 24 -; VI-XNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 18 ; HSA-VI-XNACK: NumSgprs: 24 -; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 -; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 -; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll index e152f2ddd5253..0a6aa05c2d212 100644 --- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -5,14 +5,14 @@ ; GCN-LABEL: {{^}}use_vcc: ; GCN: .set use_vcc.num_vgpr, 0 ; GCN: .set use_vcc.num_agpr, 0 -; GCN: .set use_vcc.numbered_sgpr, 32 +; GCN: .set use_vcc.numbered_sgpr, 0 ; GCN: .set use_vcc.private_seg_size, 0 ; GCN: .set use_vcc.uses_vcc, 1 ; GCN: .set use_vcc.uses_flat_scratch, 0 ; GCN: .set use_vcc.has_dyn_sized_stack, 0 ; GCN: .set use_vcc.has_recursion, 0 ; GCN: .set use_vcc.has_indirect_call, 0 -; GCN: TotalNumSgprs: 36 +; GCN: TotalNumSgprs: 4 ; GCN: NumVgprs: 0 ; GCN: ScratchSize: 0 define void @use_vcc() #1 { @@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) ; GCN-LABEL: {{^}}use_flat_scratch: ; GCN: .set use_flat_scratch.num_vgpr, 0 ; GCN: .set use_flat_scratch.num_agpr, 0 -; GCN: .set use_flat_scratch.numbered_sgpr, 32 +; GCN: .set use_flat_scratch.numbered_sgpr, 0 ; GCN: .set use_flat_scratch.private_seg_size, 0 ; GCN: .set use_flat_scratch.uses_vcc, 0 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0 ; GCN: .set use_flat_scratch.has_recursion, 0 ; GCN: .set use_flat_scratch.has_indirect_call, 0 -; GCN: TotalNumSgprs: 38 +; GCN: TotalNumSgprs: 6 ; GCN: NumVgprs: 0 ; GCN: ScratchSize: 0 define void @use_flat_scratch() #1 { @@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace ; GCN-LABEL: {{^}}use_10_vgpr: ; GCN: .set use_10_vgpr.num_vgpr, 10 ; GCN: .set use_10_vgpr.num_agpr, 0 -; GCN: .set use_10_vgpr.numbered_sgpr, 32 +; GCN: .set use_10_vgpr.numbered_sgpr, 0 ; GCN: .set use_10_vgpr.private_seg_size, 0 ; GCN: .set use_10_vgpr.uses_vcc, 0 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0 ; GCN: .set use_10_vgpr.has_recursion, 0 ; GCN: .set use_10_vgpr.has_indirect_call, 0 -; GCN: TotalNumSgprs: 36 +; GCN: TotalNumSgprs: 4 ; GCN: NumVgprs: 10 ; GCN: ScratchSize: 0 define void @use_10_vgpr() #1 { @@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { ; GCN-LABEL: {{^}}use_50_vgpr: ; GCN: .set use_50_vgpr.num_vgpr, 50 ; GCN: .set use_50_vgpr.num_agpr, 0 -; GCN: .set use_50_vgpr.numbered_sgpr, 32 +; GCN: .set use_50_vgpr.numbered_sgpr, 0 ; GCN: .set use_50_vgpr.private_seg_size, 0 ; GCN: .set use_50_vgpr.uses_vcc, 0 ; GCN: .set use_50_vgpr.uses_flat_scratch, 0 ; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0 ; GCN: .set use_50_vgpr.has_recursion, 0 ; GCN: .set use_50_vgpr.has_indirect_call, 0 -; GCN: TotalNumSgprs: 36 +; GCN: TotalNumSgprs: 4 ; GCN: NumVgprs: 50 ; GCN: ScratchSize: 0 define void @use_50_vgpr() #1 { @@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { ; GCN-LABEL: {{^}}use_stack0: ; GCN: .set use_stack0.num_vgpr, 1 ; GCN: .set use_stack0.num_agpr, 0 -; GCN: .set use_stack0.numbered_sgpr, 33 +; GCN: .set use_stack0.numbered_sgpr, 0 ; GCN: .set use_stack0.private_seg_size, 2052 ; GCN: .set use_stack0.uses_vcc, 0 ; GCN: .set use_stack0.uses_flat_scratch, 0 ; GCN: .set use_stack0.has_dyn_sized_stack, 0 ; GCN: .set use_stack0.has_recursion, 0 ; GCN: .set use_stack0.has_indirect_call, 0 -; GCN: TotalNumSgprs: 37 +; GCN: TotalNumSgprs: 4 ; GCN: NumVgprs: 1 ; GCN: ScratchSize: 2052 define void @use_stack0() #1 { @@ -277,14 +277,14 @@ define void @use_stack0() #1 { ; GCN-LABEL: {{^}}use_stack1: ; GCN: .set use_stack1.num_vgpr, 1 ; GCN: .set use_stack1.num_agpr, 0 -; GCN: .set use_stack1.numbered_sgpr, 33 +; GCN: .set use_stack1.numbered_sgpr, 0 ; GCN: .set use_stack1.private_seg_size, 404 ; GCN: .set use_stack1.uses_vcc, 0 ; GCN: .set use_stack1.uses_flat_scratch, 0 ; GCN: .set use_stack1.has_dyn_sized_stack, 0 ; GCN: .set use_stack1.has_recursion, 0 ; GCN: .set use_stack1.has_indirect_call, 0 -; GCN: TotalNumSgprs: 37 +; GCN: TotalNumSgprs: 4 ; GCN: NumVgprs: 1 ; GCN: ScratchSize: 404 define void @use_stack1() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index cd89a36fe538b..bf452a9e38e01 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,7 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 10 +; CHECK: .sgpr_count: 16 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 024593c49dba1..f7e3498907005 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -63,7 +63,7 @@ ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 -; ELF: 0250: 6770725F 636F756E 7402B12E 76677072 +; ELF: 0250: 6770725F 636F756E 7401B12E 76677072 ; ELF: 0260: 5F737069 6C6C5F63 6F756E74 00AF2E77 ; ELF: 0270: 61766566 726F6E74 5F73697A 6540AD61 ; ELF: 0280: 6D646873 612E7461 72676574 BD616D64 diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll new file mode 100644 index 0000000000000..45de8a79fe88d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Use VGPRs above the input arguments. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0x1d{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; clobber v28", "~{v28}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll new file mode 100644 index 0000000000000..9c636d4516a80 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll @@ -0,0 +1,46 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: leaf_shader: +; CHECK: .vgpr_count:{{.*}}0x1{{$}} + +; Function without calls. +define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value, + i32 %active.vgpr1, i32 %active.vgpr2, + i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6) + local_unnamed_addr { +entry: + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %compute, label %merge + +compute: + ; Perform a more complex computation using active VGPRs + %square = mul i32 %active.vgpr1, %active.vgpr1 + %product = mul i32 %square, %active.vgpr2 + %sum = add i32 %product, %input.value + %result = add i32 %sum, 42 + br label %merge + +merge: + %final.result = phi i32 [ 0, %entry ], [ %result, %compute ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ] + + store i32 %final.result, ptr %output.ptr, align 4 + + ret void +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll new file mode 100644 index 0000000000000..1b0d33cec7052 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes. +; In that case, the VGPR should be included in the .vgpr_count +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xd{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll new file mode 100644 index 0000000000000..9408501718784 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll @@ -0,0 +1,71 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xa{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 464cd820028cc..c3b033113431f 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -64,7 +64,7 @@ define void @func_regular_call() #1 { ; GCN-NEXT: s_addc_u32 s17, ; GCN-NEXT: s_setpc_b64 s[16:17] -; GCN: ; TotalNumSgprs: 32 +; GCN: ; TotalNumSgprs: 18 ; GCN: ; NumVgprs: 8 define void @func_tail_call() #1 { tail call void @func() diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll index 60bbf4646ee03..03694b913d6e0 100644 --- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll +++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll @@ -24,7 +24,7 @@ define void @I_Quit() { ; CHECK-LABEL: P_RemoveMobj: ; CHECK: .set P_RemoveMobj.num_vgpr, 0 ; CHECK: .set P_RemoveMobj.num_agpr, 0 -; CHECK: .set P_RemoveMobj.numbered_sgpr, 32 +; CHECK: .set P_RemoveMobj.numbered_sgpr, 0 ; CHECK: .set P_RemoveMobj.private_seg_size, 0 ; CHECK: .set P_RemoveMobj.uses_vcc, 0 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0 @@ -38,7 +38,7 @@ define void @P_RemoveMobj() { ; CHECK-LABEL: P_SpawnMobj: ; CHECK: .set P_SpawnMobj.num_vgpr, 0 ; CHECK: .set P_SpawnMobj.num_agpr, 0 -; CHECK: .set P_SpawnMobj.numbered_sgpr, 32 +; CHECK: .set P_SpawnMobj.numbered_sgpr, 0 ; CHECK: .set P_SpawnMobj.private_seg_size, 0 ; CHECK: .set P_SpawnMobj.uses_vcc, 0 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0 @@ -52,7 +52,7 @@ define void @P_SpawnMobj() { ; CHECK-LABEL: G_PlayerReborn: ; CHECK: .set G_PlayerReborn.num_vgpr, 0 ; CHECK: .set G_PlayerReborn.num_agpr, 0 -; CHECK: .set G_PlayerReborn.numbered_sgpr, 32 +; CHECK: .set G_PlayerReborn.numbered_sgpr, 0 ; CHECK: .set G_PlayerReborn.private_seg_size, 0 ; CHECK: .set G_PlayerReborn.uses_vcc, 0 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0 @@ -66,7 +66,7 @@ define void @G_PlayerReborn() { ; CHECK-LABEL: P_SetThingPosition: ; CHECK: .set P_SetThingPosition.num_vgpr, 0 ; CHECK: .set P_SetThingPosition.num_agpr, 0 -; CHECK: .set P_SetThingPosition.numbered_sgpr, 32 +; CHECK: .set P_SetThingPosition.numbered_sgpr, 0 ; CHECK: .set P_SetThingPosition.private_seg_size, 0 ; CHECK: .set P_SetThingPosition.uses_vcc, 0 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0 @@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) { ; CHECK-LABEL: HU_Start: ; CHECK: .set HU_Start.num_vgpr, 0 ; CHECK: .set HU_Start.num_agpr, 0 -; CHECK: .set HU_Start.numbered_sgpr, 32 +; CHECK: .set HU_Start.numbered_sgpr, 0 ; CHECK: .set HU_Start.private_seg_size, 0 ; CHECK: .set HU_Start.uses_vcc, 0 ; CHECK: .set HU_Start.uses_flat_scratch, 0 @@ -162,7 +162,7 @@ define void @G_DoReborn() { ; CHECK-LABEL: AM_Stop: ; CHECK: .set AM_Stop.num_vgpr, 0 ; CHECK: .set AM_Stop.num_agpr, 0 -; CHECK: .set AM_Stop.numbered_sgpr, 32 +; CHECK: .set AM_Stop.numbered_sgpr, 0 ; CHECK: .set AM_Stop.private_seg_size, 0 ; CHECK: .set AM_Stop.uses_vcc, 0 ; CHECK: .set AM_Stop.uses_flat_scratch, 0 @@ -176,7 +176,7 @@ define void @AM_Stop() { ; CHECK-LABEL: D_AdvanceDemo: ; CHECK: .set D_AdvanceDemo.num_vgpr, 0 ; CHECK: .set D_AdvanceDemo.num_agpr, 0 -; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32 +; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0 @@ -190,7 +190,7 @@ define void @D_AdvanceDemo() { ; CHECK-LABEL: F_StartFinale: ; CHECK: .set F_StartFinale.num_vgpr, 0 ; CHECK: .set F_StartFinale.num_agpr, 0 -; CHECK: .set F_StartFinale.numbered_sgpr, 32 +; CHECK: .set F_StartFinale.numbered_sgpr, 0 ; CHECK: .set F_StartFinale.private_seg_size, 0 ; CHECK: .set F_StartFinale.uses_vcc, 0 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0 @@ -204,7 +204,7 @@ define void @F_StartFinale() { ; CHECK-LABEL: F_Ticker: ; CHECK: .set F_Ticker.num_vgpr, 0 ; CHECK: .set F_Ticker.num_agpr, 0 -; CHECK: .set F_Ticker.numbered_sgpr, 32 +; CHECK: .set F_Ticker.numbered_sgpr, 0 ; CHECK: .set F_Ticker.private_seg_size, 0 ; CHECK: .set F_Ticker.uses_vcc, 0 ; CHECK: .set F_Ticker.uses_flat_scratch, 0 @@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() { ; CHECK-LABEL: P_TempSaveGameFile: ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0 -; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32 +; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0 @@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() { ; CHECK-LABEL: P_SaveGameFile: ; CHECK: .set P_SaveGameFile.num_vgpr, 2 ; CHECK: .set P_SaveGameFile.num_agpr, 0 -; CHECK: .set P_SaveGameFile.numbered_sgpr, 32 +; CHECK: .set P_SaveGameFile.numbered_sgpr, 0 ; CHECK: .set P_SaveGameFile.private_seg_size, 0 ; CHECK: .set P_SaveGameFile.uses_vcc, 0 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0 @@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() { ; CHECK-LABEL: R_FlatNumForName: ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr) ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr) -; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr) +; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr) ; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size) ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc) ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch) diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll index 7a810d0067c17..83f58db1aa67f 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}qux ; CHECK: .set qux.num_vgpr, 13 ; CHECK: .set qux.num_agpr, 0 -; CHECK: .set qux.numbered_sgpr, 32 +; CHECK: .set qux.numbered_sgpr, 0 ; CHECK: .set qux.private_seg_size, 0 ; CHECK: .set qux.uses_vcc, 0 ; CHECK: .set qux.uses_flat_scratch, 0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 638dc8965987e..28c3131302a31 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -83,13 +83,13 @@ ; CHECK-NEXT: multiple_stack: ; CHECK-NEXT: .backend_stack_size: 0x24 ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .sgpr_count: 0x1 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24 ; CHECK-NEXT: .vgpr_count: 0x3 ; CHECK-NEXT: no_stack: ; CHECK-NEXT: .backend_stack_size: 0 ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .sgpr_count: 0x1 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0 ; CHECK-NEXT: .vgpr_count: 0x1 ; CHECK-NEXT: no_stack_call: @@ -122,7 +122,7 @@ ; CHECK-NEXT: simple_lds: ; CHECK-NEXT: .backend_stack_size: 0 ; CHECK-NEXT: .lds_size: 0x100 -; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .sgpr_count: 0x1 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0 ; CHECK-NEXT: .vgpr_count: 0x1 ; CHECK-NEXT: simple_lds_recurse: @@ -134,7 +134,7 @@ ; CHECK-NEXT: simple_stack: ; CHECK-NEXT: .backend_stack_size: 0x14 ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .sgpr_count: 0x1 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14 ; CHECK-NEXT: .vgpr_count: 0x2 ; CHECK-NEXT: simple_stack_call: diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll index 5b9b0feea9900..a71fd7fe782ff 100644 --- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll +++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll @@ -2,7 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg: -; ;CHECK: NumVgprs: 4 +; ;CHECK: NumVgprs: 2 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { .entry: %i1 = extractelement <2 x float> %arg3, i32 1 @@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused: -; CHECK: NumVgprs: 4 +; CHECK: NumVgprs: 2 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 { .entry: ret { <4 x float> } undef @@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input ; Additionally set the PSInputAddr to 0 via the metadata ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0: -; CHECK: NumVgprs: 4 +; CHECK: NumVgprs: 2 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 { .entry: ret { <4 x float> } undef diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll index 35e11ad6a648b..bfcf90037bfd3 100644 --- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll @@ -24,7 +24,9 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) % ; SI-LABEL: {{^}}one_vgpr_used: ; SI: NumVgprs: 1 -define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind { +define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 { store i32 %x, ptr addrspace(1) %out, align 4 ret void } + +attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll index afb77ed190896..a2470a60cb19f 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -122,8 +122,8 @@ define void @test_func() !dbg !6 { } ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel -; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4 -; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0 +; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 22 +; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 3 ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0 ; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll index 0d25bc97ff775..557ffd27a07f6 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -4,8 +4,8 @@ ; CHECK-LABEL: {{^}}spill: ; GCN: NumSgprs: 104 ; GCN-GCNTRACKERS: NumSgprs: 104 -; GCN: NumVgprs: 1 -; GCN-GCNTRACKERS: NumVgprs: 2 +; GCN: NumVgprs: 3 +; GCN-GCNTRACKERS: NumVgprs: 3 ; GCN: ScratchSize: 0 ; GCN-GCNTRACKERS: ScratchSize: 0 ; GCN: Occupancy: 5 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index c5732531f5423..95d707aee5662 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -11,8 +11,8 @@ ; allow scheduling of other instructions which reduce RP ; CHECK-LABEL: {{^}}return_72xi32: -; GFX11-PAL: NumSgprs: 33 -; GFX11-PAL-GCNTRACKERS: NumSgprs: 33 +; GFX11-PAL: NumSgprs: 0 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 0 ; GFX11-PAL: NumVgprs: 64 ; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 ; GFX11-PAL: ScratchSize: 220 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 462ac23ec7e0e..8300a52955b91 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -7,14 +7,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s -; SI-MINREG: NumSgprs: {{[1-9]$}} -; SI-MINREG: NumVgprs: {{[1-9]$}} +; SI-MINREG: NumSgprs: {{[1]?[1-9]$}} +; SI-MINREG: NumVgprs: {{[1]?[1-9]$}} ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}} ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}} ; stores may alias loads -; VI-MINREG: NumSgprs: {{[0-9]$}} +; VI-MINREG: NumSgprs: {{[1]?[0-9]$}} ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}} ; stores may alias loads diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 6ddf0986755f9..682bbdedb37a3 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; VI-NEXT: .amdhsa_next_free_vgpr 1 +; VI-NEXT: .amdhsa_next_free_vgpr 3 ; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; GFX9-NEXT: .amdhsa_next_free_vgpr 1 +; GFX9-NEXT: .amdhsa_next_free_vgpr 3 ; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; VI-NEXT: .amdhsa_next_free_vgpr 1 +; VI-NEXT: .amdhsa_next_free_vgpr 3 ; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 @@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; GFX9-NEXT: .amdhsa_next_free_vgpr 1 +; GFX9-NEXT: .amdhsa_next_free_vgpr 3 ; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 @@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; VI-NEXT: .amdhsa_next_free_vgpr 1 +; VI-NEXT: .amdhsa_next_free_vgpr 3 ; VI-NEXT: .amdhsa_next_free_sgpr 18 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 @@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 -; GFX9-NEXT: .amdhsa_next_free_vgpr 1 +; GFX9-NEXT: .amdhsa_next_free_vgpr 3 ; GFX9-NEXT: .amdhsa_next_free_sgpr 18 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 30accc846d2b6..d3def45c4f9d2 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: -; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_next_free_sgpr 8 ; ASM: .amdhsa_reserve_xnack_mask 1 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting. @@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata -; ELF: .sgpr_count: 9 +; ELF: .sgpr_count: 12 entry: tail call void asm sideeffect "", "~{s[0:4]}"() ret void diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 4f84b31f1877b..ad831e040d722 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: -; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_next_free_sgpr 8 ; ASM: .amdhsa_reserve_xnack_mask 0 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting. @@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!....... ; ELF: AMDGPU Metadata -; ELF: .sgpr_count: 5 +; ELF: .sgpr_count: 8 entry: tail call void asm sideeffect "", "~{s[0:4]}"() ret void diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 644f434923368..d1e28e11601ce 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @kern() #0 { ; ASM-LABEL: kern: -; ASM: .amdhsa_next_free_sgpr 5 +; ASM: .amdhsa_next_free_sgpr 8 ; ASM: .amdhsa_reserve_xnack_mask 1 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting. @@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata -; ELF: .sgpr_count: 9 +; ELF: .sgpr_count: 12 entry: tail call void asm sideeffect "", "~{s[0:4]}"() ret void diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll index cf5b95a729974..4802ec861d685 100644 --- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll +++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: __unnamed_1: ; CHECK: .set __unnamed_1.num_vgpr, 0 ; CHECK: .set __unnamed_1.num_agpr, 0 -; CHECK: .set __unnamed_1.numbered_sgpr, 32 +; CHECK: .set __unnamed_1.numbered_sgpr, 0 ; CHECK: .set __unnamed_1.private_seg_size, 0 ; CHECK: .set __unnamed_1.uses_vcc, 0 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0 @@ -16,7 +16,7 @@ entry: } ; CHECK-LABEL: __unnamed_2: -; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr) +; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr) ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr) ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr) ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll index 2cb5e309c8c21..ee35dc4cddade 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll @@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 { } ; GCN-LABEL: {{^}}f1024_0: -; GFX90A: NumVgprs: 32 +; GFX90A: NumVgprs: 1 ; GFX90A: NumAgprs: 1 -; GFX90A: TotalNumVgprs: 33 +; GFX90A: TotalNumVgprs: 5 define void @f1024_0() #1024 { call void @foo() ret void diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll new file mode 100644 index 0000000000000..8c8182db7b479 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll @@ -0,0 +1,30 @@ +; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED +; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED +target triple = "amdgcn-amd-amdhsa" + +@global = addrspace(1) global i32 poison, align 4 + +; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR. +; Only hardware-initialized VGPRs (v0) are read in this kernel. + +; CHECK-LABEL: amdhsa.kernels: +; CHECK-LABEL: kernel_x +; CHECK: .vgpr_count: 1 +define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret void +} + +; CHECK-LABEL: kernel_z +; PACKED: .vgpr_count: 1 +; NOTPACKED: .vgpr_count: 3 +define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.z() + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret void +} + +attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll new file mode 100644 index 0000000000000..f5d28a0ae1628 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll @@ -0,0 +1,35 @@ +; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s +; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count. +target triple = "amdgcn--amdpal" + +@global = addrspace(1) global i32 poison, align 4 + +; CHECK-LABEL: amdpal.pipelines: + +; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used. +; CHECK-LABEL: .entry_point_symbol: kernel_use +; CHECK: .vgpr_count: 0x20 +define amdgpu_cs void @kernel_use([32 x i32] %args) { +entry: + %a = extractvalue [32 x i32] %args, 14 + store i32 %a, ptr addrspace(1) @global + ret void +} + +; Neither uses not writes a VGPR +; CHECK-LABEL: chain_func: +; CHECK: .vgpr_count: 0x1 +define amdgpu_cs_chain void @chain_func([32 x i32] %args) { +entry: + call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a( + ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0) + unreachable +} + +; Neither uses not writes a VGPR +; CHECK-LABEL: gfx_func: +; CHECK: .vgpr_count: 0x1 +define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) { +entry: + ret [32 x i32] %args +}