Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4255,10 +4255,9 @@ same *vendor-name*.
wavefront for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly. This
register is written to. This
includes the special
SGPRs for VCC, Flat
Scratch (GFX7-GFX9)
Expand All @@ -4276,10 +4275,10 @@ same *vendor-name*.
each work-item for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly.
register is
written to.
".agpr_count" integer Required Number of accumulator
registers required by
each work-item for
Expand Down
14 changes: 4 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
// Shaders that use the init.whole.wave intrinsic sometimes have VGPR
// arguments that are only added for the purpose of preserving their inactive
// lanes. Skip including them in the VGPR count.
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

Expand Down Expand Up @@ -1061,15 +1064,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
} else if (isKernel(F.getCallingConv()) &&
MFI->getNumKernargPreloadedSGPRs()) {
// Consider cases where the total number of UserSGPRs with trailing
// allocated preload SGPRs, is greater than the number of explicitly
// referenced SGPRs.
const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
ProgInfo.NumSGPR =
AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
}

// Adjust number of registers used to meet default/requested minimum/maximum
Expand Down
283 changes: 17 additions & 266 deletions llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,274 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (MFI->isStackRealigned())
Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

Info.UsesVCC =
MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);

Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
Info.NumExplicitSGPR =
TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
if (ST.hasMAIInsts())
Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);

// Count any user or system SGPRs that are actually used.
for (int I = MFI->getNumPreloadedSGPRs() - 1; I >= 0; I--)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Braces, but also should just take the raw number of preloaded SGPRs to start with. What happens if you don't allocate preloaded registers that were requested? If we wanted to trim out unused preloaded registers, it should have happened earlier (or, we wouldn't have preloaded in the first place). This could also break the debugger use case if it expects to find something there

if (MRI.isPhysRegUsed(AMDGPU::SGPR0 + I)) {
Info.NumExplicitSGPR = std::max<int32_t>(
Info.NumExplicitSGPR, TRI.getHWRegIndex(AMDGPU::SGPR0 + I) + 1);
break;
}

if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
return Info;
}

int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;

if (!MO.isReg())
continue;

Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
case AMDGPU::M0_LO16:
case AMDGPU::M0_HI16:
case AMDGPU::SRC_SHARED_BASE_LO:
case AMDGPU::SRC_SHARED_BASE:
case AMDGPU::SRC_SHARED_LIMIT_LO:
case AMDGPU::SRC_SHARED_LIMIT:
case AMDGPU::SRC_PRIVATE_BASE_LO:
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT_LO:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
case AMDGPU::MODE:
continue;

case AMDGPU::NoRegister:
assert(MI.isDebugInstr() &&
"Instruction uses invalid noreg register");
continue;

case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::VCC_LO_LO16:
case AMDGPU::VCC_LO_HI16:
case AMDGPU::VCC_HI_LO16:
case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;

case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
continue;

case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
llvm_unreachable("xnack_mask registers should not be used");

case AMDGPU::LDS_DIRECT:
llvm_unreachable("lds_direct register should not be used");

case AMDGPU::TBA:
case AMDGPU::TBA_LO:
case AMDGPU::TBA_HI:
case AMDGPU::TMA:
case AMDGPU::TMA_LO:
case AMDGPU::TMA_HI:
llvm_unreachable("trap handler registers should not be used");

case AMDGPU::SRC_VCCZ:
llvm_unreachable("src_vccz register should not be used");

case AMDGPU::SRC_EXECZ:
llvm_unreachable("src_execz register should not be used");

case AMDGPU::SRC_SCC:
llvm_unreachable("src_scc register should not be used");

default:
break;
}

if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
} else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
IsSGPR = false;
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 3;
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
} else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 3;
} else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 4;
} else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
IsSGPR = false;
Width = 5;
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
} else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 5;
} else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
IsSGPR = false;
Width = 6;
} else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
IsSGPR = true;
Width = 6;
} else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 6;
} else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
IsSGPR = false;
Width = 7;
} else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
IsSGPR = true;
Width = 7;
} else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
} else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 8;
} else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
IsSGPR = false;
Width = 9;
} else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
IsSGPR = true;
Width = 9;
} else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 9;
} else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
IsSGPR = false;
Width = 10;
} else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
IsSGPR = true;
Width = 10;
} else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 10;
} else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
IsSGPR = false;
Width = 11;
} else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
IsSGPR = true;
Width = 11;
} else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 11;
} else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
IsSGPR = false;
Width = 12;
} else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
IsSGPR = true;
Width = 12;
} else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
IsSGPR = false;
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 32;
} else {
// We only expect TTMP registers or registers that do not belong to
// any RC.
assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
AMDGPU::TTMP_64RegClass.contains(Reg) ||
AMDGPU::TTMP_128RegClass.contains(Reg) ||
AMDGPU::TTMP_256RegClass.contains(Reg) ||
AMDGPU::TTMP_512RegClass.contains(Reg) ||
!TRI.getPhysRegBaseClass(Reg)) &&
"Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
} else if (IsAGPR) {
MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
}

if (MI.isCall()) {
// Pseudo used just to encode the underlying global. Is there a better
// way to track this?
Expand Down Expand Up @@ -464,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
}
}

Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;

return Info;
}
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4053,6 +4053,21 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
return 0;
}

unsigned
SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const {
auto isDefinedByImplicitDef = [](MachineOperand &Op) {
return Op.getParent()->isImplicitDef();
};

for (MCPhysReg Reg : reverse(RC.getRegisters()))
for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
if (!(MRI.def_empty(*AI) || std::all_of(MRI.def_begin(*AI), MRI.def_end(),
isDefinedByImplicitDef)))
return getHWRegIndex(Reg) + 1;
return 0;
}

SmallVector<StringLiteral>
SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
const MachineFunction &MF) const {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;

// Scans the registers of \p RC in reverse order and, for the first register
// that has a definition which is not an IMPLICIT_DEF (on the register itself
// or on any register aliasing it), \returns its hardware register index plus
// one. \returns 0 if no register in \p RC has such a definition.
// Only definitions recorded in \p MRI are considered; this does not go inside
// function calls.
unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
const TargetRegisterClass &RC) const;

std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
: std::optional<uint8_t>{};
Expand Down
Loading