Skip to content

Commit 636c606

Browse files
committed
Refactor uniform Y/Z helper, add tests
1 parent edfe5df commit 636c606

File tree

4 files changed

+65
-14
lines changed

4 files changed

+65
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,24 @@ AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
246246
return std::nullopt;
247247
}
248248

249+
bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
250+
const Function &F, bool RequiresUniformYZ) const {
251+
auto *Node = F.getMetadata("reqd_work_group_size");
252+
if (!Node || Node->getNumOperands() != 3)
253+
return false;
254+
unsigned XLen =
255+
mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
256+
unsigned YLen =
257+
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
258+
unsigned ZLen =
259+
mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
260+
261+
bool Is1D = YLen <= 1 && ZLen <= 1;
262+
bool IsXLargeEnough =
263+
isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
264+
return Is1D || IsXLargeEnough;
265+
}
266+
249267
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
250268
return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
251269
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,20 @@ class AMDGPUSubtarget {
106106
std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
107107
unsigned Dim) const;
108108

109+
/// \returns true if \p F will execute in a manner that leaves the X
110+
/// dimensions of the workitem ID evenly tiling wavefronts - that is, if X /
111+
/// wavefrontsize is uniform. This is true if either the Y and Z block
112+
/// dimensions are known to always be 1 or if the X dimension will always be a
113+
/// power of 2. If \p RequireUniformYZ is true, it also ensures that the Y and
114+
/// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
115+
/// wavesize64 would ordinarily pass this test, it won't with
116+
/// \pRequiresUniformYZ).
117+
///
118+
/// This information is currently only gathered from the !reqd_work_group_size
119+
/// metadata on \p F, but this may be improved in the future.
120+
bool hasWavefrontsEvenlySplittingXDim(const Function &F,
121+
bool REquiresUniformYZ = false) const;
122+
109123
/// \returns Subtarget's default pair of minimum/maximum number of waves per
110124
/// execution unit for function \p F, or minimum/maximum number of waves per
111125
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,18 +1006,12 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
10061006
}
10071007
case Intrinsic::amdgcn_workitem_id_y:
10081008
case Intrinsic::amdgcn_workitem_id_z: {
1009-
// If the X dimension is guaranteed to launch with a size that is a power
1010-
// of 2
1011-
// >= the wavefront size, then the Y and Z dimensions are uniform.
1012-
// Similarly, if the dimension has size 1, it is also uniform.
10131009
const Function *F = Intrinsic->getFunction();
1014-
std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
1015-
if (ReqdXDimSize && isPowerOf2_32(*ReqdXDimSize) &&
1016-
*ReqdXDimSize >= ST->getWavefrontSize())
1017-
return false;
1010+
bool HasUniformYZ =
1011+
ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
10181012
std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
10191013
*F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1020-
return !ThisDimSize || *ThisDimSize != 1;
1014+
return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
10211015
}
10221016
default:
10231017
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
@@ -1073,11 +1067,7 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
10731067
bool XDimDoesntResetWithinWaves = false;
10741068
if (auto *I = dyn_cast<Instruction>(V)) {
10751069
const Function *F = I->getFunction();
1076-
std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
1077-
XDimDoesntResetWithinWaves =
1078-
ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
1079-
if (ReqdXDimSize.has_value() && isPowerOf2_32(*ReqdXDimSize))
1080-
XDimDoesntResetWithinWaves = true;
1070+
XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
10811071
}
10821072
using namespace llvm::PatternMatch;
10831073
uint64_t C;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,40 @@ define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
113113
ret void
114114
}
115115

116+
; The required work-group size used here (!4) is 64x1x1, i.e. the Z dimension
; has length 1. The test expects that nothing in this function, including the
; workitem.id.z call, is reported as divergent.
; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
117+
; CHECK-NOT: DIVERGENT
118+
define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) !reqd_work_group_size !4 {
119+
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
120+
store i32 %id.z, ptr %o
121+
ret void
122+
}
123+
124+
; Uses a required work-group size of 128x2x1 (!5) together with attributes #3
; (gfx900). The raw workitem.id.x value is expected to be the only value
; reported as divergent; the X id shifted right by 6 (%id.sg) and the store of
; it must not be reported.
; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_div_wavefront_size'
125+
; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
126+
; CHECK-NOT: DIVERGENT
127+
define amdgpu_kernel void @workitem_id_x_div_wavefront_size(ptr %o) #3 !reqd_work_group_size !5 {
128+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
129+
%id.sg = lshr i32 %id.x, 6
130+
store i32 %id.sg, ptr %o
131+
ret void
132+
}
133+
134+
; Uses a required work-group size of 128x2x1 (!5) together with attributes #3
; (gfx900). Even though the Y dimension has length 2, the test expects that
; workitem.id.y is not reported as divergent.
; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_uniform_in_subgroup'
135+
; CHECK-NOT: DIVERGENT
136+
define amdgpu_kernel void @workitem_id_y_uniform_in_subgroup(ptr %o) #3 !reqd_work_group_size !5 {
137+
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
138+
store i32 %id.y, ptr %o
139+
ret void
140+
}
141+
116142
attributes #0 = { nounwind readnone }
117143
attributes #1 = { nounwind }
118144
attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
145+
attributes #3 = { "target-cpu"="gfx900" "amdgpu-flat-work-group-size"="256,256" }
119146

120147
!0 = !{i32 1, i32 1, i32 1}
121148
!1 = !{i32 2, i32 1, i32 1}
122149
!2 = !{i32 1, i32 2, i32 1}
123150
!3 = !{i32 1, i32 1, i32 2}
151+
!4 = !{i32 64, i32 1, i32 1}
152+
!5 = !{i32 128, i32 2, i32 1}

0 commit comments

Comments
 (0)