Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
// Entry functions need to count input arguments even if they're not used
// (i.e. not reported by AMDGPUResourceUsageAnalysis). Other functions can
// skip including them. This is especially important for shaders that use the
// init.whole.wave intrinsic, since they sometimes have VGPR arguments that
// are only added for the purpose of preserving their inactive lanes and
// should not be included in the vgpr-count.
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you fuse these into one function? I don't want to have to compute the intersection myself to figure out the list of calling conventions (CCs) this applies to.

bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

Expand Down
299 changes: 44 additions & 255 deletions llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,268 +139,56 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(

Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
/*IncludeCalls=*/false);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
/*IncludeCalls=*/false);

// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
bool HasCalls = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
// Functions that use the llvm.amdgcn.init.whole.wave intrinsic often have
// VGPR arguments that are only added for the purpose of preserving the
// inactive lanes. These should not be included in the number of used VGPRs.
bool NeedsExplicitVGPRCount = MFI->hasInitWholeWave();
if (!HasCalls && !NeedsExplicitVGPRCount) {

Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
/*IncludeCalls=*/false);
return Info;
}

int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;

if (!MO.isReg())
continue;

Register Reg = MO.getReg();
switch (Reg) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing this huge switch is great, but does it have to be part of this patch? Could it be a separate NFC (no functional change) patch?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I did consider removing it in a separate patch, but then there would be no reason for the whole loop to exist. So I'd end up removing the loop in one patch and then re-adding it in the next, which felt kind of silly.

case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;

case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;

case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;

case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;

case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");

case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");

case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");

case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");

case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");

case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");

default:
break;
}

if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
if (NeedsExplicitVGPRCount) {
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);

if (!MO.isReg())
continue;
Register Reg = MO.getReg();
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);

if (!RC || !TRI.isVGPRClass(RC))
continue;

// Skip inactive VGPRs in chain functions with the init.whole.wave
// intrinsic. These will only appear as implicit use operands on the
// chain call, and as the def of an IMPLICIT_DEF. We're going to skip
// implicit defs unconditionally though because if they're important
// in a different context then they will be counted when they are
// used.
bool IsChainCall =
MFI->isChainFunction() && MI.getOpcode() == AMDGPU::SI_TCRETURN;
if (IsChainCall || MI.isImplicitDef())
continue;

unsigned Width = TRI.getRegSizeInBits(*RC) / 32;
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
Expand Down Expand Up @@ -464,9 +252,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}

Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
if (NeedsExplicitVGPRCount)
Info.NumVGPR = MaxVGPR + 1;
else
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, false);

return Info;
}
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4046,11 +4046,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}

unsigned
SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const {
unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC,
bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
if (MRI.isPhysRegUsed(Reg))
if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -482,9 +482,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned SubReg) const;

// \returns a number of registers of a given \p RC used in a function.
// Does not go inside function calls.
// Does not go inside function calls. If \p IncludeCalls is true, it will
// include registers that may be clobbered by calls.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
const TargetRegisterClass &RC,
bool IncludeCalls = true) const;

std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
Expand Down
Loading
Loading