From 45c8fe1b5c7001d18e50e50154d2c2592bbdf3f1 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Wed, 26 Feb 2025 13:09:03 -0500 Subject: [PATCH 1/3] insertwaitcnt pass update for true16 --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++++++++++++--------- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 13 +++++++++ llvm/test/CodeGen/AMDGPU/spillv16.ll | 2 +- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 13e01b53639d2..ffabaa6ba4107 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -130,10 +130,10 @@ enum WaitEventType { // We reserve a fixed number of VGPR slots in the scoring tables for // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. - SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. - NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. + SQ_MAX_PGM_VGPRS = 1024, // Maximum programmable VGPRs across all targets. + AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. + SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. + NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. // Artificial register slots to track LDS writes into specific LDS locations // if a location is known. When slots are exhausted or location is // unknown use the first slot. The first slot is also always updated in @@ -748,27 +748,33 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & - AMDGPU::HWEncoding::REG_IDX_MASK; + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + unsigned RegIdx = TRI->getHWRegIndex(MCReg); + assert(isUInt<8>(RegIdx)); + unsigned Reg = (RegIdx << 1) | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); + const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); + unsigned Size = TRI->getRegSizeInBits(*RC); + + // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits if (TRI->isVectorRegister(*MRI, Op.getReg())) { assert(Reg <= SQ_MAX_PGM_VGPRS); Result.first = Reg; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && Reg < SQ_MAX_PGM_SGPRS) { + assert(Size % 16 == 0); + Result.second = Result.first + (Size / 16); + } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && + (Reg >> 1) < SQ_MAX_PGM_SGPRS) { // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar // sources like SRC_PRIVATE_BASE. - Result.first = Reg + NUM_ALL_VGPRS; + Result.first = (Reg >> 1) + NUM_ALL_VGPRS; + Result.second = Result.first + ((Size + 16) / 32); } else { return {-1, -1}; } - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); - unsigned Size = TRI->getRegSizeInBits(*RC); - Result.second = Result.first + ((Size + 16) / 32); - return Result; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index a64180daea2ad..785f019687b55 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -295,8 +295,21 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const; + bool isVGPR(MCRegister Reg) const { + const TargetRegisterClass *RC = getPhysRegBaseClass(Reg); + // Registers without classes are unaddressable, SGPR-like registers. + return RC && isVGPRClass(RC); + } bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const; + bool isAGPR(MCRegister Reg) const { + const TargetRegisterClass *RC = getPhysRegBaseClass(Reg); + // Registers without classes are unaddressable, SGPR-like registers. + return RC && isAGPRClass(RC); + } bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const; + bool isVectorRegister(MCRegister Reg) const { + return isVGPR(Reg) || isAGPR(Reg); + } bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const { return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); } diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 0e45df223465d..3d21860e2af40 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() { ; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc ; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc From 3f3c575881d4c4374b40dcbc369b8111ec739750 Mon Sep 17 00:00:00 2001 From: guochen2 Date: Mon, 10 Mar 2025 13:11:18 -0400 Subject: [PATCH 2/3] address comment --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 11 +++++------ llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 13 ------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ffabaa6ba4107..c31e7701cf5b8 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -751,26 +751,25 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); - unsigned Reg = (RegIdx << 1) | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); unsigned Size = TRI->getRegSizeInBits(*RC); // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits if (TRI->isVectorRegister(*MRI, Op.getReg())) { - assert(Reg <= SQ_MAX_PGM_VGPRS); + unsigned Reg = (RegIdx << 1) | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); + assert(Reg <= AGPR_OFFSET); Result.first = Reg; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); assert(Size % 16 == 0); Result.second = Result.first + (Size / 16); - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && - (Reg >> 1) < SQ_MAX_PGM_SGPRS) { + } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar // sources like SRC_PRIVATE_BASE. - Result.first = (Reg >> 1) + NUM_ALL_VGPRS; - Result.second = Result.first + ((Size + 16) / 32); + Result.first = RegIdx + NUM_ALL_VGPRS; + Result.second = Result.first + divideCeil(Size, 32); } else { return {-1, -1}; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 785f019687b55..a64180daea2ad 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -295,21 +295,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const; - bool isVGPR(MCRegister Reg) const { - const TargetRegisterClass *RC = getPhysRegBaseClass(Reg); - // Registers without classes are unaddressable, SGPR-like registers. - return RC && isVGPRClass(RC); - } bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const; - bool isAGPR(MCRegister Reg) const { - const TargetRegisterClass *RC = getPhysRegBaseClass(Reg); - // Registers without classes are unaddressable, SGPR-like registers. - return RC && isAGPRClass(RC); - } bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const; - bool isVectorRegister(MCRegister Reg) const { - return isVGPR(Reg) || isAGPR(Reg); - } bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const { return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); } From ca55566a9ddb802610c53d30394a8950ce25beba Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Tue, 11 Mar 2025 10:28:41 -0400 Subject: [PATCH 3/3] Update llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Co-authored-by: Jay Foad --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c31e7701cf5b8..239f2664f59f3 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -757,8 +757,8 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits if (TRI->isVectorRegister(*MRI, Op.getReg())) { - unsigned Reg = (RegIdx << 1) | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); - assert(Reg <= AGPR_OFFSET); + unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); + assert(Reg < AGPR_OFFSET); Result.first = Reg; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET;