2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -991,7 +991,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
if (AMDGPU::shouldReportUnusedFuncArgs(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

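Note that this is not a pure rename: AMDGPU::isShader also covers the chain calling conventions, while the new helper (defined in AMDGPUBaseInfo.h further down) excludes them, so chain functions no longer take the count-unused-args path, and kernels stay excluded as before. A minimal standalone model of that delta; the CC enum and both predicates are simplified stand-ins for the LLVM definitions, not the real API:

```cpp
#include <cassert>

// Toy stand-in for llvm::CallingConv; only the conventions relevant here.
enum class CC { Kernel, PS, CS, CS_Chain, CS_ChainPreserve };

// Mirrors AMDGPU::isShader, which also returns true for the chain CCs.
bool isShader(CC cc) {
  switch (cc) {
  case CC::PS:
  case CC::CS:
  case CC::CS_Chain:
  case CC::CS_ChainPreserve:
    return true;
  default:
    return false;
  }
}

// Mirrors the new helper: shader entry points only, no chain CCs.
bool shouldReportUnusedFuncArgs(CC cc) {
  switch (cc) {
  case CC::PS:
  case CC::CS:
    return true;
  default:
    return false;
  }
}

int main() {
  // Ordinary shaders behave exactly as before the patch...
  assert(isShader(CC::PS) && shouldReportUnusedFuncArgs(CC::PS));
  // ...but chain functions no longer count their unused VGPR arguments,
  // which keeps init.whole.wave's placeholder args out of the vgpr-count.
  assert(isShader(CC::CS_Chain) && !shouldReportUnusedFuncArgs(CC::CS_Chain));
  // Kernels were never on this path, so their behavior is unchanged.
  assert(!isShader(CC::Kernel) && !shouldReportUnusedFuncArgs(CC::Kernel));
}
```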
299 changes: 44 additions & 255 deletions llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -139,268 +139,56 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(

Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
/*IncludeCalls=*/false);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
/*IncludeCalls=*/false);

// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
bool HasCalls = FrameInfo.hasCalls() || FrameInfo.hasTailCall();
// Functions that use the llvm.amdgcn.init.whole.wave intrinsic often have
// VGPR arguments that are only added for the purpose of preserving the
// inactive lanes. These should not be included in the number of used VGPRs.
bool NeedsExplicitVGPRCount = MFI->hasInitWholeWave();
if (!HasCalls && !NeedsExplicitVGPRCount) {

Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
/*IncludeCalls=*/false);
return Info;
}

int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;

if (!MO.isReg())
continue;

Register Reg = MO.getReg();
switch (Reg) {

Contributor:
Removing this huge switch is great but does it have to be part of this patch? Can it be a separate NFC thing?

Collaborator Author:
Well, I did consider removing it in a separate patch, but then there would be no reason for the whole loop to exist. So I'd end up removing the loop in one patch and then re-adding it in the next, which felt kind of silly.

case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;

case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;

case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;

case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;

case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");

case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");

case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");

case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");

case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");

case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");

default:
break;
}

if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
if (NeedsExplicitVGPRCount) {
for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
const MachineOperand &MO = MI.getOperand(i);

if (!MO.isReg())
continue;
Register Reg = MO.getReg();
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);

if (!RC || !TRI.isVGPRClass(RC))
continue;

// Skip inactive VGPRs in chain functions with the init.whole.wave
// intrinsic. These will only appear as implicit use operands on the
// chain call, and as the def of an IMPLICIT_DEF. We're going to skip
// implicit defs unconditionally though because if they're important
// in a different context then they will be counted when they are
// used.
bool IsChainCall =
MFI->isChainFunction() && MI.getOpcode() == AMDGPU::SI_TCRETURN;
if (IsChainCall || MI.isImplicitDef())
continue;

unsigned Width = TRI.getRegSizeInBits(*RC) / 32;
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}
@@ -464,9 +252,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}

Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
if (NeedsExplicitVGPRCount)
Info.NumVGPR = MaxVGPR + 1;
else
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, false);

return Info;
}
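On the explicit-count path, Info.NumVGPR ends up as the highest VGPR index any counted operand touches, plus one; the skipped operands (implicit uses on chain calls, IMPLICIT_DEF defs) simply never enter the max. A standalone sketch of just that arithmetic, with VRegOperand and countVGPRs as illustrative names rather than LLVM API:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Each counted register operand reduced to the two numbers that matter:
// the hardware index of its first register (TRI.getHWRegIndex) and its
// width in 32-bit registers (TRI.getRegSizeInBits(*RC) / 32).
struct VRegOperand {
  unsigned HWReg;
  unsigned Width;
};

unsigned countVGPRs(const std::vector<VRegOperand> &Ops) {
  int MaxVGPR = -1; // -1 so that no operands yields a count of 0
  for (const VRegOperand &Op : Ops) {
    int MaxUsed = Op.HWReg + Op.Width - 1; // highest register the tuple touches
    MaxVGPR = std::max(MaxVGPR, MaxUsed);
  }
  return MaxVGPR + 1; // becomes Info.NumVGPR
}

int main() {
  // A VReg_128 tuple starting at v4 occupies v4..v7, so 8 VGPRs are used,
  // even if v0..v3 include untouched gaps.
  assert(countVGPRs({{4, 4}}) == 8);
  // Placeholder args filtered out before this point don't raise the max:
  // with only a single v0 use remaining, the count is 1.
  assert(countVGPRs({{0, 1}}) == 1);
}
```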
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4046,11 +4046,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
return 0;
}

unsigned
SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const {
unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC,
bool IncludeCalls) const {
for (MCPhysReg Reg : reverse(RC.getRegisters()))
if (MRI.isPhysRegUsed(Reg))
if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
return getHWRegIndex(Reg) + 1;
return 0;
}
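The new IncludeCalls flag maps onto MachineRegisterInfo::isPhysRegUsed's SkipRegMaskTest parameter, which controls whether a register that is only clobbered by a call's regmask counts as used. Below is a runnable model of the reverse scan, with a callback standing in for MRI; the real code walks RC.getRegisters() instead of raw indices:

```cpp
#include <cassert>
#include <functional>

// Model: a register class is just hardware indices 0..NumRegs-1, and the
// callback plays the role of MRI.isPhysRegUsed(Reg, SkipRegMaskTest).
unsigned getNumUsedPhysRegs(
    unsigned NumRegs,
    const std::function<bool(unsigned, bool)> &IsPhysRegUsed,
    bool IncludeCalls = true) {
  // Scan from the highest register downward; the first used register's
  // index plus one is the number of used registers.
  for (unsigned HWReg = NumRegs; HWReg-- > 0;)
    if (IsPhysRegUsed(HWReg, /*SkipRegMaskTest=*/!IncludeCalls))
      return HWReg + 1;
  return 0;
}

int main() {
  // v0 and v1 are used directly; v2..v31 are only clobbered by a call's
  // register mask.
  auto Used = [](unsigned HWReg, bool SkipRegMaskTest) {
    if (HWReg <= 1)
      return true;           // explicit def/use
    return !SkipRegMaskTest; // regmask clobber: counted only if included
  };
  assert(getNumUsedPhysRegs(32, Used, /*IncludeCalls=*/true) == 32);
  assert(getNumUsedPhysRegs(32, Used, /*IncludeCalls=*/false) == 2);
}
```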
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -482,9 +482,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned SubReg) const;

// \returns a number of registers of a given \p RC used in a function.
// Does not go inside function calls.
// Does not go inside function calls. If \p IncludeCalls is true, it will
// include registers that may be clobbered by calls.
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;
const TargetRegisterClass &RC,
bool IncludeCalls = true) const;

std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1351,6 +1351,28 @@ constexpr bool isEntryFunctionCC(CallingConv::ID CC) {
}
}

// Shaders that are entry functions need to count input arguments even if
// they're not used (i.e. not reported by AMDGPUResourceUsageAnalysis). Other
// functions can skip including them. This is especially important for shaders
// that use the init.whole.wave intrinsic, since they sometimes have VGPR
// arguments that are only added for the purpose of preserving their inactive
// lanes and should not be included in the vgpr-count.
LLVM_READNONE
constexpr bool shouldReportUnusedFuncArgs(CallingConv::ID CC) {

Contributor:
Name should express the reason, not the usage context. Although here I don't understand why you're going out of your way to exclude kernels. The same reasoning should apply when using preloaded arguments.

Collaborator Author:
Can you suggest a better name? This is mostly just an implementation detail. Maybe it shouldn't be in AMDGPUBaseInfo in the first place. Should I just move it to AMDGPUAsmPrinter.cpp?

> Although here I don't understand why you're going out of your way to exclude kernels. The same reasoning should apply when using preloaded arguments.

Graphics and kernels handle hardware-initialized registers a bit differently. For graphics, we're putting them as arguments to the IR functions, and for compute we track them in SIMachineFunctionInfo instead. We do handle the preloaded arguments in the same place in AMDGPUAsmPrinter, just on the else branch of where this helper is used.

Contributor:
> Can you suggest a better name? This is mostly just an implementation detail. Maybe it shouldn't be in AMDGPUBaseInfo in the first place. Should I just move it to AMDGPUAsmPrinter.cpp?

Just handle all entry points. I don't see any sensible reason why this would exclude compute entry points.

> Graphics and kernels handle hardware-initialized registers a bit differently. For graphics, we're putting them as arguments to the IR functions

I think you misunderstand. Compute now has a preloading kernel argument optimization, where the values appear in the IR kernel argument list exactly the same way as graphics. There is no fundamental difference here; it's programming the same registers even if that weren't the case.

Collaborator Author:
Ok, I get that, but the current code already treats kernels differently and changing that would cause a lot of test churn that's not related to this patch. At the moment we don't include unused VGPR arguments for kernels and I'm trying to preserve that behavior.

switch (CC) {
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
return true;
default:
return false;
}
}

LLVM_READNONE
constexpr bool isChainCC(CallingConv::ID CC) {
switch (CC) {