Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1175,22 +1175,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make the clamp modifier return 0 on NaN input.
ProgInfo.DX10Clamp = Mode.DX10Clamp;

unsigned LDSAlignShift;
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
FeatureAddressableLocalMemorySize65536)) {
// LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;
} else {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
}

ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

unsigned LDSAlignShift = Log2_32_Ceil(STM.getLDSAllocGranularity());
ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}

unsigned MaxOccupancy =
ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F, TM).second;

// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
Expand Down
24 changes: 14 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,15 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() / WorkGroupsPerCU;
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
const Function &F) const {
// FIXME: Is there an allocation granularity for the LDS? If so we would need
// to make sure the amount of bytes is aligned on that granularity.

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
uint32_t LDSBytes, const Function &F, const TargetMachine &TM) const {
// Compute occupancy restriction based on LDS usage.
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave the allocation granularity for a follow-up. There has always been an allocation granularity for r600, but that's not handled here.

Also, just move this whole thing into GCNSubtarget; don't do the triple check and downcast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reverted all changes related to allocation granularity, and will properly address this for all subtargets in a different PR (probably through a new protected member variable on AMDGPUSubtarget). I reintroduced the FIXME in the function as well.

// For GCN subtargets, LDS size must be aligned on allocation granularity.
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
LDSBytes = alignTo(LDSBytes, ST.getLDSAllocGranularity());
}

const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably should try to account for the "amdgpu-lds-size" on the function, but that's beyond the scope of this patch


// Queried LDS size may be larger than available on a CU, in which case we
Expand All @@ -72,9 +74,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
return {1, 1};

const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();
const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

auto PropsFromWGSize = [&](unsigned WGSize)
auto PropsFromWGSize = [=](unsigned WGSize)
-> std::tuple<const unsigned, const unsigned, unsigned> {
unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
Expand All @@ -91,10 +92,12 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,

// It is possible that we end up with flipped minimum and maximum number of
// waves per CU when the number of minimum/maximum concurrent groups on the CU
// is limited by LDS usage or barrier ressources.
// is limited by LDS usage or barrier resources.
if (MinWavesPerCU >= MaxWavesPerCU) {
std::swap(MinWavesPerCU, MaxWavesPerCU);
} else {
const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

// Look for a potential smaller group size than the maximum which decreases
// the concurrent number of waves on the CU for the same number of
// concurrent workgroups on the CU.
Expand Down Expand Up @@ -140,7 +143,8 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
const MachineFunction &MF) const {
const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction(),
MF.getTarget());
}

std::pair<unsigned, unsigned>
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ class AMDGPUSubtarget {
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F,
const TargetMachine &TM) const;

/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,8 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
auto [MinOcc, MaxOcc] =
getOccupancyWithWorkGroupSizes(LDSSize, F, TLInfo.getTargetMachine());
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);

Expand All @@ -417,6 +418,14 @@ GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
return {std::min(MinOcc, MaxOcc), MaxOcc};
}

/// Returns the granularity, in bytes, at which LDS is allocated. The block
/// size is selected from the subtarget's addressable-local-memory-size
/// feature bits, falling back to the smallest block when neither is set.
unsigned GCNSubtarget::getLDSAllocGranularity() const {
  const auto &Features = getFeatureBits();
  constexpr unsigned BytesPerDword = 4;

  // Default: LDS is allocated in 64 dword blocks.
  unsigned BlockDwords = 64;
  if (Features.test(AMDGPU::FeatureAddressableLocalMemorySize163840)) {
    // LDS is allocated in 320 dword blocks.
    BlockDwords = 320;
  } else if (Features.test(AMDGPU::FeatureAddressableLocalMemorySize65536)) {
    // LDS is allocated in 128 dword blocks.
    BlockDwords = 128;
  }

  return BlockDwords * BytesPerDword;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned NumSGPRs = 0,
unsigned NumVGPRs = 0) const;

/// Returns the LDS's allocation granularity in bytes.
unsigned getLDSAllocGranularity() const;

/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
Expand Down
73 changes: 43 additions & 30 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,10 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
ret void
}

define void @v32_asm_def_use(float %v0, float %v1) #0 {
; FIXME: This case is broken. The asm value passed in v32 is live
; through the range where the reserved def for the copy is introduced,
; clobbering the user value.
define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX908-LABEL: v32_asm_def_use:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand All @@ -374,48 +377,57 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v[0:31] a[0:15]
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_read_b32 v32, a15
; GFX908-NEXT: v_accvgpr_read_b32 v35, a15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v32
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a31, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a14
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_write_b32 a31, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a13
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a30, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a13
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a29, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a28, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a11
; GFX908-NEXT: v_accvgpr_write_b32 a29, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a27, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a10
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a26, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a9
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a25, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a8
; GFX908-NEXT: v_accvgpr_write_b32 a28, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a10
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a24, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a23, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a6
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a5
; GFX908-NEXT: v_accvgpr_write_b32 a26, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a9
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a21, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
; GFX908-NEXT: v_accvgpr_write_b32 a25, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a7
; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_write_b32 a23, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a6
; GFX908-NEXT: v_accvgpr_read_b32 v35, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a22, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a4
; GFX908-NEXT: v_accvgpr_write_b32 a20, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a20, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a3
; GFX908-NEXT: v_accvgpr_write_b32 a19, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a19, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a1
; GFX908-NEXT: v_accvgpr_write_b32 a18, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a1
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a17, v32
; GFX908-NEXT: v_accvgpr_read_b32 v32, a0
; GFX908-NEXT: v_accvgpr_write_b32 a17, v35
; GFX908-NEXT: v_accvgpr_read_b32 v35, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_accvgpr_write_b32 a16, v32
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def v32
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_accvgpr_write_b32 a16, v35
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -1133,3 +1145,4 @@ attributes #0 = { "amdgpu-waves-per-eu"="6,6" }
attributes #1 = { convergent nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }
attributes #3 = { "amdgpu-waves-per-eu"="7,7" }
attributes #4 = { "amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="1024,1024" }
Loading