Skip to content

Commit d1ccda1

Browse files
committed
[AMDGPU] Mark workitem IDs uniform in more cases
This fixes an old FIXME, where (workitem ID X) / (wavefront size) would never be marked uniform if it was possible that there would be Y and Z dimensions. Now, so long as the required size of the X dimension is a power of 2, dividing that dimension by the wavefront size creates a uniform value. Furthermore, if the required launch size of the X dimension is a power of 2 that's at least the wavefront size, the Y and Z workitem IDs are now marked uniform.
1 parent 6d231fb commit d1ccda1

File tree

4 files changed

+71
-19
lines changed

4 files changed

+71
-19
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -237,11 +237,13 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
237237
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
238238
}
239239

240-
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
240+
std::optional<unsigned>
241+
AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
242+
unsigned Dim) const {
241243
auto *Node = Kernel.getMetadata("reqd_work_group_size");
242244
if (Node && Node->getNumOperands() == 3)
243245
return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
244-
return std::numeric_limits<unsigned>::max();
246+
return std::nullopt;
245247
}
246248

247249
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -250,9 +252,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
250252

251253
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
252254
unsigned Dimension) const {
253-
unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
254-
if (ReqdSize != std::numeric_limits<unsigned>::max())
255-
return ReqdSize - 1;
255+
std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
256+
if (ReqdSize.has_value())
257+
return ReqdSize.value() - 1;
256258
return getFlatWorkGroupSizes(Kernel).second - 1;
257259
}
258260

@@ -303,9 +305,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
303305
}
304306

305307
if (Dim <= 3) {
306-
unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
307-
if (ReqdSize != std::numeric_limits<unsigned>::max())
308-
MinSize = MaxSize = ReqdSize;
308+
std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
309+
if (ReqdSize.has_value())
310+
MinSize = MaxSize = *ReqdSize;
309311
}
310312
}
311313
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@ class AMDGPUSubtarget {
100100
/// be converted to integer, or violate subtarget's specifications.
101101
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
102102

103+
/// \returns The required size of workgroups that will be used to execute \p F
104+
/// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
105+
/// metadata). Otherwise, returns std::nullopt.
106+
std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
107+
unsigned Dim) const;
108+
103109
/// \returns Subtarget's default pair of minimum/maximum number of waves per
104110
/// execution unit for function \p F, or minimum/maximum number of waves per
105111
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/Analysis/LoopInfo.h"
2323
#include "llvm/Analysis/ValueTracking.h"
2424
#include "llvm/CodeGen/Analysis.h"
25+
#include "llvm/IR/Function.h"
2526
#include "llvm/IR/IRBuilder.h"
2627
#include "llvm/IR/IntrinsicsAMDGPU.h"
2728
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
10031004
DstAS == AMDGPUAS::FLAT_ADDRESS &&
10041005
ST->hasGloballyAddressableScratch();
10051006
}
1007+
case Intrinsic::amdgcn_workitem_id_y:
1008+
case Intrinsic::amdgcn_workitem_id_z: {
1009+
// If the X dimension is guaranteed to launch with a size that is a power
1010+
// of 2
1011+
// >= the wavefront size, then the Y and Z dimensions are uniform.
1012+
// Similarly, a Y or Z dimension whose required size is 1 is also uniform.
1013+
const Function *F = Intrinsic->getFunction();
1014+
std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
1015+
if (ReqdXDimSize.has_value() && isPowerOf2_32(*ReqdXDimSize) &&
1016+
*ReqdXDimSize >= ST->getWavefrontSize())
1017+
return false;
1018+
std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1019+
*F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1020+
return !(ThisDimSize.has_value() && *ThisDimSize == 1);
1021+
}
10061022
default:
10071023
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
10081024
}
@@ -1049,28 +1065,35 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
10491065
// packed into a same wave which gives 1 and 0 after the division by 64
10501066
// respectively.
10511067
//
1052-
// FIXME: limit it to 1D kernels only, although that shall be possible
1053-
// to perform this optimization is the size of the X dimension is a power
1054-
// of 2, we just do not currently have infrastructure to query it.
1068+
// The X dimension doesn't reset within a wave if either both the Y
1069+
// and Z dimensions are of length 1, or if the X dimension's required
1070+
// size is a power of 2. Note, however, if the X dimension's maximum
1071+
// size is a power of 2 < the wavefront size, division by the wavefront
1072+
// size is guaranteed to yield 0, so this is also a no-reset case.
1073+
bool XDimDoesntResetWithinWaves = false;
1074+
if (auto *I = dyn_cast<Instruction>(V)) {
1075+
const Function *F = I->getFunction();
1076+
std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
1077+
XDimDoesntResetWithinWaves =
1078+
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1079+
if (ReqdXDimSize.has_value() && isPowerOf2_32(*ReqdXDimSize))
1080+
XDimDoesntResetWithinWaves = true;
1081+
}
10551082
using namespace llvm::PatternMatch;
10561083
uint64_t C;
10571084
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10581085
m_ConstantInt(C))) ||
10591086
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10601087
m_ConstantInt(C)))) {
1061-
const Function *F = cast<Instruction>(V)->getFunction();
1062-
return C >= ST->getWavefrontSizeLog2() &&
1063-
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1088+
return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
10641089
}
10651090

10661091
Value *Mask;
10671092
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
10681093
m_Value(Mask)))) {
1069-
const Function *F = cast<Instruction>(V)->getFunction();
1070-
const DataLayout &DL = F->getDataLayout();
10711094
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
10721095
ST->getWavefrontSizeLog2() &&
1073-
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1096+
XDimDoesntResetWithinWaves;
10741097
}
10751098

10761099
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,13 @@ entry:
9898
}
9999

100100
; GCN-LABEL: {{^}}lshr_threadid_3d:
101-
; GCN: global_load_dword
101+
; W64: global_load_dword
102+
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
103+
; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
102104

103105
; OPT-LABEL: @lshr_threadid_3d
104-
; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
106+
; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
107+
; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
105108
define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
106109
entry:
107110
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
114117
ret void
115118
}
116119

120+
; GCN-LABEL: {{^}}high_id_uniform:
121+
; GCN: v_lshlrev_b32_e32 v0, 2, v2
122+
; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
123+
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
124+
125+
; OPT-LABEL: @high_id_uniform
126+
; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
127+
define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
128+
entry:
129+
%zid = tail call i32 @llvm.amdgcn.workitem.id.z()
130+
%zid.zext = zext nneg i32 %zid to i64
131+
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
132+
%load = load i32, ptr addrspace(1) %arrayidx, align 4
133+
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
134+
store i32 %load, ptr addrspace(1) %arrayidx2, align 4
135+
ret void
136+
}
137+
117138
; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
118139
; W64: global_load_dword
119140
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0

0 commit comments

Comments
 (0)