Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, MFI.getLDSSize()),
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
Expand Down Expand Up @@ -1272,8 +1272,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}

ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);

const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}

unsigned MaxOccupancy =
ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;

// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
Expand Down
125 changes: 81 additions & 44 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,55 +55,92 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine; and
// taking into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
if (!MaxWorkGroupsPerCu)
return 0;

const unsigned WaveSize = getWavefrontSize();

// FIXME: Do we need to account for alignment requirement of LDS rounding the
// size up?
// Compute restriction based on LDS usage
unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

// This can be queried with more LDS than is possible, so just assume the
// worst.
if (NumGroups == 0)
return 1;

NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

// Round to the number of waves per CU.
const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

// Number of waves per EU (SIMD).
MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

// Clamp to the maximum possible number of waves.
MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
const Function &F) const {
// FIXME: Is there an allocation granularity for the LDS? If so we would need
// to make sure the amount of bytes is aligned on that granularity.

// Compute occupancy restriction based on LDS usage.
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably should try to account for the "amdgpu-lds-size" on the function, but that's beyond the scope of this patch


// Queried LDS size may be larger than available on a CU, in which case we
// consider the only achievable occupancy to be 1, in line with what we
// consider the occupancy to be when the number of requested registers in a
// particular bank is higher than the number of available ones in that bank.
if (!MaxWGsLDS)
return {1, 1};

const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

auto PropsFromWGSize = [&](unsigned WGSize)
-> std::tuple<const unsigned, const unsigned, unsigned> {
unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
};

// The maximum group size will generally yield the minimum number of
// workgroups, maximum number of waves, and minimum occupancy. The opposite is
// generally true for the minimum group size. LDS or barrier resource
// limitations can flip those minimums/maximums.
const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

// It is possible that we end up with flipped minimum and maximum number of
// waves per CU when the number of minimum/maximum concurrent groups on the CU
// is limited by LDS usage or barrier resources.
if (MinWavesPerCU >= MaxWavesPerCU) {
std::swap(MinWavesPerCU, MaxWavesPerCU);
} else {
// Look for a potential smaller group size than the maximum which decreases
// the concurrent number of waves on the CU for the same number of
// concurrent workgroups on the CU.
unsigned MinWavesPerCUForWGSize =
divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
if (MinWavesPerCU > MinWavesPerCUForWGSize) {
unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
// There may exist a smaller group size than the maximum that achieves
// the minimum number of waves per CU. This group size is the largest
// possible size that requires MaxWavesPerWG - E waves where E is
// maximized under the following constraints.
// 1. 0 <= E <= ExcessSlotsPerWG
// 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
MaxWavesPerWG - MinWavesPerWG);
}
}

// FIXME: Needs to be a multiple of the group size?
//MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
// Look for a potential larger group size than the minimum which increases
// the concurrent number of waves on the CU for the same number of
// concurrent workgroups on the CU.
unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
// There may exist a larger group size than the minimum that achieves the
// maximum number of waves per CU. This group size is the smallest
// possible size that requires MinWavesPerWG + L waves where L is
// maximized under the following constraints.
// 1. 0 <= L <= LeftoverSlotsPerWG
// 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
((MaxWGSize - 1) / WaveSize) + 1 -
MinWavesPerWG);
}
}

assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
"computed invalid occupancy");
return MaxWaves;
// Return the minimum/maximum number of waves on any EU, assuming that all
// wavefronts are spread across all EUs as evenly as possible.
return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,21 @@ class AMDGPUSubtarget {
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;

/// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount if
/// the given LDS memory size is the only constraint.
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F and each
/// workgroup running the function requires \p LDSBytes bytes of LDS space.
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;

unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
/// depends on the range of allowed flat group sizes for the function, the
/// amount of per-workgroup LDS space required by the function, and hardware
/// characteristics.
std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;

bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1721,7 +1721,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(

if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
}

auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
Expand Down
9 changes: 4 additions & 5 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,9 +1089,8 @@ bool PreRARematStage::initGCNSchedStage() {
return false;

const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
// Check maximum occupancy
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
DAG.MinOccupancy)
// Rematerialization will not help if occupancy is not limited by reg usage.
if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
return false;

// FIXME: This pass will invalidate cached MBBLiveIns for regions
Expand Down Expand Up @@ -1272,8 +1271,8 @@ void GCNSchedStage::checkScheduling() {
return;
}

unsigned TargetOccupancy =
std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
Expand Down
20 changes: 10 additions & 10 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,16 +405,16 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
unsigned Occupancy =
std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
if (NumSGPRs)
Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
if (NumVGPRs)
Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
return Occupancy;
// Computes the {minimum, maximum} occupancy, in waves per EU, for \p F given
// \p LDSSize bytes of LDS and \p NumSGPRs / \p NumVGPRs registers.
//
// The starting range is whatever getOccupancyWithWorkGroupSizes() reports for
// the function and LDS size; the register counts are then turned into
// occupancy limits through getOccupancyWithNumSGPRs() and
// getOccupancyWithNumVGPRs() and applied to the upper bound.
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  std::pair<unsigned, unsigned> OccRange =
      getOccupancyWithWorkGroupSizes(LDSSize, F);
  const unsigned OccFromSGPRs = getOccupancyWithNumSGPRs(NumSGPRs);
  const unsigned OccFromVGPRs = getOccupancyWithNumVGPRs(NumVGPRs);

  // Register usage can only lower the upper bound; keep the tightest limit.
  OccRange.second = std::min({OccRange.second, OccFromSGPRs, OccFromVGPRs});

  // Keep the range well-formed: the lower bound never exceeds the upper bound.
  OccRange.first = std::min(OccRange.first, OccRange.second);
  return OccRange;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
Expand Down
16 changes: 11 additions & 5 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1368,12 +1368,18 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;

/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F, each workgroup
/// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
/// NumVGPRs VGPRs. The flat workgroup sizes associated with the function are a
/// range, so this returns a range as well.
///
/// Note that occupancy can be affected by the scratch allocation as well, but
/// we do not have enough information to compute it.
unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
unsigned LDSSize = 0,
unsigned NumSGPRs = 0,
unsigned NumVGPRs = 0) const;

/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
Expand Down
5 changes: 2 additions & 3 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);

Occupancy = ST.computeOccupancy(F, getLDSSize());
Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv();

VRegFlags.reserve(1024);
Expand Down Expand Up @@ -185,8 +185,7 @@ MachineFunctionInfo *SIMachineFunctionInfo::clone(
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
limitOccupancy(getMaxWavesPerEU());
const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
MF.getFunction()));
limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
}

Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
Expand Down
9 changes: 3 additions & 6 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3642,18 +3642,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
MF.getFunction());
unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
switch (RC->getID()) {
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
}
}

Expand Down
Loading
Loading