diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf9549..8b44a4ee948ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -237,11 +237,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
   return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
 }
 
-static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+std::optional<unsigned>
+AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
+                                      unsigned Dim) const {
   auto *Node = Kernel.getMetadata("reqd_work_group_size");
   if (Node && Node->getNumOperands() == 3)
     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
-  return std::numeric_limits<unsigned>::max();
+  return std::nullopt;
+}
+
+bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
+    const Function &F, bool RequiresUniformYZ) const {
+  auto *Node = F.getMetadata("reqd_work_group_size");
+  if (!Node || Node->getNumOperands() != 3)
+    return false;
+  unsigned XLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
+  unsigned YLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
+  unsigned ZLen =
+      mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
+
+  bool Is1D = YLen <= 1 && ZLen <= 1;
+  bool IsXLargeEnough =
+      isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
+  return Is1D || IsXLargeEnough;
 }
 
 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -250,9 +270,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
 
 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                            unsigned Dimension) const {
-  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
-  if (ReqdSize != std::numeric_limits<unsigned>::max())
-    return ReqdSize - 1;
+  std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+  if (ReqdSize)
+    return *ReqdSize - 1;
   return getFlatWorkGroupSizes(Kernel).second - 1;
 }
 
@@ -303,9 +323,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
       }
 
       if (Dim <= 3) {
-        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
-        if (ReqdSize != std::numeric_limits<unsigned>::max())
-          MinSize = MaxSize = ReqdSize;
+        std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+        if (ReqdSize)
+          MinSize = MaxSize = *ReqdSize;
       }
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6878744496cfe..57b757c990e1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -100,6 +100,26 @@ class AMDGPUSubtarget {
   /// be converted to integer, or violate subtarget's specifications.
   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 
+  /// \returns The required size of workgroups that will be used to execute \p F
+  /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
+  /// metadata). Otherwise, returns std::nullopt.
+  std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
+                                               unsigned Dim) const;
+
+  /// \returns true if \p F will execute in a manner that leaves the X
+  /// dimension of the workitem ID evenly tiling wavefronts - that is, if X /
+  /// wavefrontsize is uniform. This is true if either the Y and Z block
+  /// dimensions are known to always be 1 or if the X dimension will always be a
+  /// power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y and
+  /// Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with
+  /// wavesize 64 would ordinarily pass this test, it won't with
+  /// \p RequiresUniformYZ).
+  ///
+  /// This information is currently only gathered from the !reqd_work_group_size
+  /// metadata on \p F, but this may be improved in the future.
+  bool hasWavefrontsEvenlySplittingXDim(const Function &F,
+                                        bool RequiresUniformYZ = false) const;
+
   /// \returns Subtarget's default pair of minimum/maximum number of waves per
   /// execution unit for function \p F, or minimum/maximum number of waves per
   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f19..3e2b2c3510569 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
            DstAS == AMDGPUAS::FLAT_ADDRESS &&
            ST->hasGloballyAddressableScratch();
   }
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z: {
+    const Function *F = Intrinsic->getFunction();
+    bool HasUniformYZ =
+        ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
+    std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+        *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+    return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
+  }
   default:
     return AMDGPU::isIntrinsicSourceOfDivergence(IID);
   }
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   // packed into a same wave which gives 1 and 0 after the division by 64
   // respectively.
   //
-  // FIXME: limit it to 1D kernels only, although that shall be possible
-  // to perform this optimization is the size of the X dimension is a power
-  // of 2, we just do not currently have infrastructure to query it.
+  // The X dimension doesn't reset within a wave if either both the Y
+  // and Z dimensions are of length 1, or if the X dimension's required
+  // size is a power of 2. Note, however, that if the X dimension's maximum
+  // size is a power of 2 < the wavefront size, division by the wavefront
+  // size is guaranteed to yield 0, so this is also a no-reset case.
+  bool XDimDoesntResetWithinWaves = false;
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    const Function *F = I->getFunction();
+    XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
+  }
   using namespace llvm::PatternMatch;
   uint64_t C;
   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C))) ||
       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C)))) {
-    const Function *F = cast<Instruction>(V)->getFunction();
-    return C >= ST->getWavefrontSizeLog2() &&
-           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+    return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
   }
 
   Value *Mask;
   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        m_Value(Mask)))) {
-    const Function *F = cast<Instruction>(V)->getFunction();
-    const DataLayout &DL = F->getDataLayout();
     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
               ST->getWavefrontSizeLog2() &&
-           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+           XDimDoesntResetWithinWaves;
   }
 
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
index 7466c2396e6f1..f5668cef5d63e 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -113,11 +113,40 @@ define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
   ret void
 }
 
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) !reqd_work_group_size !4 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store i32 %id.z, ptr %o
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_div_wavefront_size'
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_x_div_wavefront_size(ptr %o) #3 !reqd_work_group_size !5 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %id.sg = lshr i32 %id.x, 6
+  store i32 %id.sg, ptr %o
+  ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_uniform_in_subgroup'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_y_uniform_in_subgroup(ptr %o) #3 !reqd_work_group_size !5 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store i32 %id.y, ptr %o
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
+attributes #3 = { "target-cpu"="gfx900" "amdgpu-flat-work-group-size"="256,256" }
 
 !0 = !{i32 1, i32 1, i32 1}
 !1 = !{i32 2, i32 1, i32 1}
 !2 = !{i32 1, i32 2, i32 1}
 !3 = !{i32 1, i32 1, i32 2}
+!4 = !{i32 64, i32 1, i32 1}
+!5 = !{i32 128, i32 2, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 90891cb28beed..f54e0019514f7 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -98,10 +98,13 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}lshr_threadid_3d:
-; GCN: global_load_dword
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
 
 ; OPT-LABEL: @lshr_threadid_3d
-; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
 define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}high_id_uniform:
+; GCN: v_lshlrev_b32_e32 v0, 2, v2
+; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @high_id_uniform
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
+define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+  %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
+  %zid.zext = zext nneg i32 %zid to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
 ; W64: global_load_dword
 ; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
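
For context only (not part of the patch): a minimal standalone C++ sketch of the linearization argument that hasWavefrontsEvenlySplittingXDim relies on, assuming workitems are packed into waves in X-major (flat) order. The function name wavesEvenlySplitX and the test values are illustrative, not anything from the LLVM sources; it brute-forces whether workitem.id.x / wavefront-size can change within a single wave for a given required workgroup size.

#include <cassert>
#include <iostream>

// Model: the lane with flat workitem number N has workitem.id.x == N % XLen.
// Returns true if X / WaveSize takes a single value within every wave.
static bool wavesEvenlySplitX(unsigned XLen, unsigned YLen, unsigned ZLen,
                              unsigned WaveSize) {
  unsigned Total = XLen * YLen * ZLen;
  for (unsigned WaveStart = 0; WaveStart < Total; WaveStart += WaveSize) {
    unsigned FirstX = WaveStart % XLen;
    for (unsigned Lane = 1; Lane < WaveSize && WaveStart + Lane < Total; ++Lane) {
      unsigned X = (WaveStart + Lane) % XLen;
      if (X / WaveSize != FirstX / WaveSize)
        return false; // workitem.id.x / WaveSize changed within one wave.
    }
  }
  return true;
}

int main() {
  assert(wavesEvenlySplitX(64, 1, 1, 64));  // 1D launch: trivially uniform.
  assert(wavesEvenlySplitX(128, 2, 1, 64)); // power-of-2 X, multi-dim: uniform.
  assert(wavesEvenlySplitX(32, 2, 1, 64));  // X < wavesize: X/64 is always 0,
                                            // though Y varies within the wave,
                                            // matching the (32, 2, 1) caveat
                                            // for RequiresUniformYZ.
  assert(!wavesEvenlySplitX(96, 2, 1, 64)); // non-power-of-2 X wraps mid-wave.
  std::cout << "all checks passed\n";
  return 0;
}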