Skip to content

Commit 9470113

Browse files
krzysz00 and arsenm authored
[AMDGPU] Mark workitem IDs uniform in more cases (llvm#152581)
This fixes an old FIXME, where (workitem ID X) / (wavefront size) would never be marked uniform if it was possible that there would be Y and Z dimensions. Now, so long as the required size of the X dimension is a power of 2, dividing that dimension by the wavefront size creates a uniform value. Furthermore, if the required launch size of the X dimension is a power of 2 that's at least the wavefront size, the Y and Z workitem IDs are now marked uniform. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent c0fc5be commit 9470113

File tree

5 files changed

+122
-19
lines changed

5 files changed

+122
-19
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -229,11 +229,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
229229
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
230230
}
231231

232-
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
232+
std::optional<unsigned>
233+
AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
234+
unsigned Dim) const {
233235
auto *Node = Kernel.getMetadata("reqd_work_group_size");
234236
if (Node && Node->getNumOperands() == 3)
235237
return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
236-
return std::numeric_limits<unsigned>::max();
238+
return std::nullopt;
239+
}
240+
241+
bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
242+
const Function &F, bool RequiresUniformYZ) const {
243+
auto *Node = F.getMetadata("reqd_work_group_size");
244+
if (!Node || Node->getNumOperands() != 3)
245+
return false;
246+
unsigned XLen =
247+
mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
248+
unsigned YLen =
249+
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
250+
unsigned ZLen =
251+
mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
252+
253+
bool Is1D = YLen <= 1 && ZLen <= 1;
254+
bool IsXLargeEnough =
255+
isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
256+
return Is1D || IsXLargeEnough;
237257
}
238258

239259
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -242,9 +262,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
242262

243263
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
244264
unsigned Dimension) const {
245-
unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
246-
if (ReqdSize != std::numeric_limits<unsigned>::max())
247-
return ReqdSize - 1;
265+
std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
266+
if (ReqdSize)
267+
return *ReqdSize - 1;
248268
return getFlatWorkGroupSizes(Kernel).second - 1;
249269
}
250270

@@ -295,9 +315,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
295315
}
296316

297317
if (Dim <= 3) {
298-
unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
299-
if (ReqdSize != std::numeric_limits<unsigned>::max())
300-
MinSize = MaxSize = ReqdSize;
318+
std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
319+
if (ReqdSize)
320+
MinSize = MaxSize = *ReqdSize;
301321
}
302322
}
303323
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,26 @@ class AMDGPUSubtarget {
100100
/// be converted to integer, or violate subtarget's specifications.
101101
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
102102

103+
/// \returns The required size of workgroups that will be used to execute \p F
104+
/// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
105+
/// metadata). Otherwise, returns std::nullopt.
106+
std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
107+
unsigned Dim) const;
108+
109+
/// \returns true if \p F will execute in a manner that leaves the X
110+
/// dimension of the workitem ID evenly tiling wavefronts - that is, if X /
111+
/// wavefrontsize is uniform. This is true if either the Y and Z block
112+
/// dimensions are known to always be 1 or if the X dimension will always be a
113+
/// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y and
114+
/// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
115+
/// wavesize64 would ordinarily pass this test, it won't with
116+
/// \p RequiresUniformYZ).
117+
///
118+
/// This information is currently only gathered from the !reqd_work_group_size
119+
/// metadata on \p F, but this may be improved in the future.
120+
bool hasWavefrontsEvenlySplittingXDim(const Function &F,
121+
bool REquiresUniformYZ = false) const;
122+
103123
/// \returns Subtarget's default pair of minimum/maximum number of waves per
104124
/// execution unit for function \p F, or minimum/maximum number of waves per
105125
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/Analysis/LoopInfo.h"
2323
#include "llvm/Analysis/ValueTracking.h"
2424
#include "llvm/CodeGen/Analysis.h"
25+
#include "llvm/IR/Function.h"
2526
#include "llvm/IR/IRBuilder.h"
2627
#include "llvm/IR/IntrinsicsAMDGPU.h"
2728
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
10031004
DstAS == AMDGPUAS::FLAT_ADDRESS &&
10041005
ST->hasGloballyAddressableScratch();
10051006
}
1007+
case Intrinsic::amdgcn_workitem_id_y:
1008+
case Intrinsic::amdgcn_workitem_id_z: {
1009+
const Function *F = Intrinsic->getFunction();
1010+
bool HasUniformYZ =
1011+
ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1012+
std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1013+
*F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1014+
return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1015+
}
10061016
default:
10071017
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
10081018
}
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
10491059
// packed into a same wave which gives 1 and 0 after the division by 64
10501060
// respectively.
10511061
//
1052-
// FIXME: limit it to 1D kernels only, although that shall be possible
1053-
// to perform this optimization is the size of the X dimension is a power
1054-
// of 2, we just do not currently have infrastructure to query it.
1062+
// The X dimension doesn't reset within a wave if either both the Y
1063+
// and Z dimensions are of length 1, or if the X dimension's required
1064+
// size is a power of 2. Note, however, if the X dimension's maximum
1065+
// size is a power of 2 < the wavefront size, division by the wavefront
1066+
// size is guaranteed to yield 0, so this is also a no-reset case.
1067+
bool XDimDoesntResetWithinWaves = false;
1068+
if (auto *I = dyn_cast<Instruction>(V)) {
1069+
const Function *F = I->getFunction();
1070+
XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1071+
}
10551072
using namespace llvm::PatternMatch;
10561073
uint64_t C;
10571074
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10581075
m_ConstantInt(C))) ||
10591076
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10601077
m_ConstantInt(C)))) {
1061-
const Function *F = cast<Instruction>(V)->getFunction();
1062-
return C >= ST->getWavefrontSizeLog2() &&
1063-
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1078+
return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
10641079
}
10651080

10661081
Value *Mask;
10671082
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10681083
m_Value(Mask)))) {
1069-
const Function *F = cast<Instruction>(V)->getFunction();
1070-
const DataLayout &DL = F->getDataLayout();
10711084
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
10721085
ST->getWavefrontSizeLog2() &&
1073-
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1086+
XDimDoesntResetWithinWaves;
10741087
}
10751088

10761089
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,40 @@ define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
113113
ret void
114114
}
115115

116+
; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
117+
; CHECK-NOT: DIVERGENT
118+
define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) !reqd_work_group_size !4 {
119+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
120+
store i32 %id.z, ptr %o
121+
ret void
122+
}
123+
124+
; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_div_wavefront_size'
125+
; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
126+
; CHECK-NOT: DIVERGENT
127+
define amdgpu_kernel void @workitem_id_x_div_wavefront_size(ptr %o) #3 !reqd_work_group_size !5 {
128+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
129+
%id.sg = lshr i32 %id.x, 6
130+
store i32 %id.sg, ptr %o
131+
ret void
132+
}
133+
134+
; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_uniform_in_subgroup'
135+
; CHECK-NOT: DIVERGENT
136+
define amdgpu_kernel void @workitem_id_y_uniform_in_subgroup(ptr %o) #3 !reqd_work_group_size !5 {
137+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
138+
store i32 %id.y, ptr %o
139+
ret void
140+
}
141+
116142
attributes #0 = { nounwind readnone }
117143
attributes #1 = { nounwind }
118144
attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
145+
attributes #3 = { "target-cpu"="gfx900" "amdgpu-flat-work-group-size"="256,256" }
119146

120147
!0 = !{i32 1, i32 1, i32 1}
121148
!1 = !{i32 2, i32 1, i32 1}
122149
!2 = !{i32 1, i32 2, i32 1}
123150
!3 = !{i32 1, i32 1, i32 2}
151+
!4 = !{i32 64, i32 1, i32 1}
152+
!5 = !{i32 128, i32 2, i32 1}

llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,13 @@ entry:
9898
}
9999

100100
; GCN-LABEL: {{^}}lshr_threadid_3d:
101-
; GCN: global_load_dword
101+
; W64: global_load_dword
102+
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
103+
; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
102104

103105
; OPT-LABEL: @lshr_threadid_3d
104-
; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
106+
; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
107+
; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
105108
define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
106109
entry:
107110
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
114117
ret void
115118
}
116119

120+
; GCN-LABEL: {{^}}high_id_uniform:
121+
; GCN: v_lshlrev_b32_e32 v0, 2, v2
122+
; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
123+
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
124+
125+
; OPT-LABEL: @high_id_uniform
126+
; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
127+
define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
128+
entry:
129+
%zid = tail call i32 @llvm.amdgcn.workitem.id.z()
130+
%zid.zext = zext nneg i32 %zid to i64
131+
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
132+
%load = load i32, ptr addrspace(1) %arrayidx, align 4
133+
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
134+
store i32 %load, ptr addrspace(1) %arrayidx2, align 4
135+
ret void
136+
}
137+
117138
; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
118139
; W64: global_load_dword
119140
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0

0 commit comments

Comments
 (0)