diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 7ec2ee06b811a..971b05e1bd161 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -18,6 +18,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNSubtarget.h" #include "llvm/ADT/FloatingPointMode.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include @@ -481,6 +482,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC, return false; } +static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old, + Function &NewCallee, ArrayRef Ops) { + SmallVector OpBundles; + Old.getOperandBundlesAsDefs(OpBundles); + + CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles); + NewCall->takeName(&Old); + return NewCall; +} + +Instruction * +GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC, + IntrinsicInst &II) const { + const auto IID = II.getIntrinsicID(); + assert(IID == Intrinsic::amdgcn_readlane || + IID == Intrinsic::amdgcn_readfirstlane || + IID == Intrinsic::amdgcn_permlane64); + + Instruction *OpInst = dyn_cast(II.getOperand(0)); + + // Only do this if both instructions are in the same block + // (so the exec mask won't change) and the readlane is the only user of its + // operand. + if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent()) + return nullptr; + + const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane); + + // If this is a readlane, check that the second operand is a constant, or is + // defined before OpInst so we know it's safe to move this intrinsic higher. + Value *LaneID = nullptr; + if (IsReadLane) { + LaneID = II.getOperand(1); + + // readlane takes an extra operand for the lane ID, so we must check if that + // LaneID value can be used at the point where we want to move the + // intrinsic. 
+ if (auto *LaneIDInst = dyn_cast(LaneID)) { + if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst)) + return nullptr; + } + } + + // Hoist the intrinsic (II) through OpInst. + // + // (II (OpInst x)) -> (OpInst (II x)) + const auto DoIt = [&](unsigned OpIdx, + Function *NewIntrinsic) -> Instruction * { + SmallVector Ops{OpInst->getOperand(OpIdx)}; + if (IsReadLane) + Ops.push_back(LaneID); + + // Rewrite the intrinsic call. + CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops); + + // Rewrite OpInst so it takes the result of the intrinsic now. + Instruction &NewOp = *OpInst->clone(); + NewOp.setOperand(OpIdx, NewII); + return &NewOp; + }; + + // TODO(?): Should we do more with permlane64? + if (IID == Intrinsic::amdgcn_permlane64 && !isa(OpInst)) + return nullptr; + + if (isa(OpInst)) + return DoIt(0, II.getCalledFunction()); + + if (isa(OpInst)) { + Value *Src = OpInst->getOperand(0); + Type *SrcTy = Src->getType(); + if (!isTypeLegal(SrcTy)) + return nullptr; + + Function *Remangled = + Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy}); + return DoIt(0, Remangled); + } + + // We can also hoist through binary operators if the other operand is uniform. + if (isa(OpInst)) { + // FIXME: If we had access to UniformityInfo here we could just check + // if the operand is uniform. 
+ if (isTriviallyUniform(OpInst->getOperandUse(0))) + return DoIt(1, II.getCalledFunction()); + if (isTriviallyUniform(OpInst->getOperandUse(1))) + return DoIt(0, II.getCalledFunction()); + } + + return nullptr; +} + std::optional GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -1173,31 +1266,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { simplifyDemandedLaneMaskArg(IC, II, 1)) return &II; - // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1) - if (auto *BC = dyn_cast(Src); - BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) { - Value *BCSrc = BC->getOperand(0); - - // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants. - if (isTypeLegal(BCSrc->getType())) { - Module *M = IC.Builder.GetInsertBlock()->getModule(); - Function *Remangled = - Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()}); - - // Make sure convergence tokens are preserved. - // TODO: CreateIntrinsic should allow directly copying bundles - SmallVector OpBundles; - II.getOperandBundlesAsDefs(OpBundles); - - SmallVector Args(II.args()); - Args[0] = BCSrc; - - CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles); - NewCall->takeName(&II); - return new BitCastInst(NewCall, II.getType()); - } - } - // If the lane argument of bpermute is uniform, change it to readlane. This // generates better code and can enable further optimizations because // readlane is AlwaysUniform. 
@@ -1214,6 +1282,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } } + if (IID != Intrinsic::amdgcn_ds_bpermute) { + if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II)) + return Res; + } + return std::nullopt; } case Intrinsic::amdgcn_writelane: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index f6f7bd4bfcf5b..e00720dfa1eb7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -236,6 +236,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { const APInt &DemandedElts, APInt &UndefElts) const; + Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC, + IntrinsicInst &II) const; + std::optional simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll new file mode 100644 index 0000000000000..60561459e3f11 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -0,0 +1,675 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s + +; test unary + +define float @hoist_fneg_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fneg_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg float [[TMP0]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fneg float %arg + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define double @hoist_fneg_f64(double %arg) { +; CHECK-LABEL: define 
double @hoist_fneg_f64( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg double [[TMP0]] +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fneg double %arg + %rfl = call double @llvm.amdgcn.readfirstlane.f64(double %val) + ret double %rfl +} + +; test casts + +define i32 @hoist_trunc(i64 %arg) { +; CHECK-LABEL: define i32 @hoist_trunc( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = trunc i64 %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i64 @hoist_zext(i32 %arg) { +; CHECK-LABEL: define i64 @hoist_zext( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = zext i32 %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define i64 @hoist_sext(i32 %arg) { +; CHECK-LABEL: define i64 @hoist_sext( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = sext i32 %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define i32 @hoist_fptoui(float %arg) { +; CHECK-LABEL: define i32 @hoist_fptoui( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptoui float 
[[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = fptoui float %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_fptosi(float %arg) { +; CHECK-LABEL: define i32 @hoist_fptosi( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptosi float [[RFL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = fptosi float %arg to i32 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_uitofp(i32 %arg) { +; CHECK-LABEL: define float @hoist_uitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = uitofp i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define float @hoist_sitofp(i32 %arg) { +; CHECK-LABEL: define float @hoist_sitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = sitofp i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = sitofp i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define float @hoist_fptrunc(double %arg) { +; CHECK-LABEL: define float @hoist_fptrunc( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fptrunc double [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = fptrunc double %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float 
%rfl +} + +define float @hoist_fpext(half %arg) { +; CHECK-LABEL: define float @hoist_fpext( +; CHECK-SAME: half [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = fpext half [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = fpext half %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i64 @hoist_ptrtoint(ptr %arg) { +; CHECK-LABEL: define i64 @hoist_ptrtoint( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[RFL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = ptrtoint ptr %arg to i64 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define ptr @hoist_inttoptr(i64 %arg) { +; CHECK-LABEL: define ptr @hoist_inttoptr( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[RFL]] to ptr +; CHECK-NEXT: ret ptr [[TMP0]] +; +bb: + %val = inttoptr i64 %arg to ptr + %rfl = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %val) + ret ptr %rfl +} + +define float @hoist_bitcast(i32 %arg) { +; CHECK-LABEL: define float @hoist_bitcast( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[RFL]] to float +; CHECK-NEXT: ret float [[TMP0]] +; +bb: + %val = bitcast i32 %arg to float + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define ptr addrspace(1) @hoist_addrspacecast(ptr addrspace(0) %arg) { +; CHECK-LABEL: define ptr addrspace(1) @hoist_addrspacecast( +; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] { 
+; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RFL:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[ARG]]) +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[RFL]] to ptr addrspace(1) +; CHECK-NEXT: ret ptr addrspace(1) [[TMP0]] +; +bb: + %val = addrspacecast ptr addrspace(0) %arg to ptr addrspace(1) + %rfl = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %val) + ret ptr addrspace(1) %rfl +} + +; test binary i32 + +define i32 @hoist_add_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_add_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fadd_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fadd_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fadd float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_sub_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sub_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], -16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fsub_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fsub_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float 
@llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[TMP0]], -1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_mul_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_mul_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = mul i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = mul i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fmul_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fmul_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fmul float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fmul float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_udiv_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_udiv_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = udiv i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = udiv i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_sdiv_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sdiv_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sdiv i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sdiv i32 %arg, 16777215 + 
%rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fdiv_f32(float %arg) { +; CHECK-LABEL: define float @hoist_fdiv_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fmul float [[TMP0]], 7.812500e-03 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fdiv float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_urem_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_urem_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = urem i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = urem i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_srem_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_srem_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = srem i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = srem i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_frem_f32(float %arg) { +; CHECK-LABEL: define float @hoist_frem_f32( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = frem float [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = frem float %arg, 128.0 + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +define i32 @hoist_shl_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_shl_i32( +; 
CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = shl i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = shl i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_lshr_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_lshr_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = lshr i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = lshr i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_ashr_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_ashr_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = ashr i32 [[TMP0]], 4 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = ashr i32 %arg, 4 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + + +define i32 @hoist_and_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_and_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = and i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = and i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_or_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_or_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = or i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = or i32 
%arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_xor_i32(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_xor_i32( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = xor i32 [[TMP0]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = xor i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test binary i64 + +define i64 @hoist_and_i64(i64 %arg) { +; CHECK-LABEL: define i64 @hoist_and_i64( +; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = and i64 [[TMP0]], 16777215 +; CHECK-NEXT: ret i64 [[RFL]] +; +bb: + %val = and i64 %arg, 16777215 + %rfl = call i64 @llvm.amdgcn.readfirstlane.i64(i64 %val) + ret i64 %rfl +} + +define double @hoist_fadd_f64(double %arg) { +; CHECK-LABEL: define double @hoist_fadd_f64( +; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd double [[TMP0]], 1.280000e+02 +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fadd double %arg, 128.0 + %rfl = call double @llvm.amdgcn.readfirstlane.f64(double %val) + ret double %rfl +} + +; test constant on LHS + +define i32 @hoist_sub_i32_lhs(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_sub_i32_lhs( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[TMP0]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 16777215, %arg + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define float @hoist_fsub_f32_lhs(float 
%arg) { +; CHECK-LABEL: define float @hoist_fsub_f32_lhs( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[TMP0]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %rfl = call float @llvm.amdgcn.readfirstlane.f32(float %val) + ret float %rfl +} + +; test other operand is trivially uniform + +define i32 @hoist_add_i32_trivially_uniform_rhs(i32 %arg, i32 %v.other) { +; CHECK-LABEL: define i32 @hoist_add_i32_trivially_uniform_rhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[V_OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V_OTHER]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], [[OTHER]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v.other) + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +define i32 @hoist_add_i32_trivially_uniform_lhs(i32 %arg, i32 %v.other) { +; CHECK-LABEL: define i32 @hoist_add_i32_trivially_uniform_lhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[V_OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V_OTHER]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 [[OTHER]], [[TMP0]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v.other) + %val = sub i32 %other, %arg + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test multiple iterations + +define i32 @hoist_multiple_times(i32 %arg) { +; CHECK-LABEL: define i32 @hoist_multiple_times( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] 
{ +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 16777215, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 4242 +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP3]], 6 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val.0 = shl i32 %arg, 2 + %val.1 = sub i32 16777215, %val.0 + %val.2 = xor i32 %val.1, 4242 + %val.3 = add i32 %val.2, 6 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val.3) + ret i32 %rfl +} + +; test cases where hoisting isn't possible + +define i32 @cross_block_hoisting(i1 %cond, i32 %arg) { +; CHECK-LABEL: define i32 @cross_block_hoisting( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], 16777215 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[VAL]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %val = add i32 %arg, 16777215 + br i1 %cond, label %then, label %end + +then: + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + br label %end + +end: + %res = phi i32 [%rfl, %then], [%val, %bb] + ret i32 %res +} + +define i32 @operand_is_instr(i32 %arg, ptr %src) { +; CHECK-LABEL: define i32 @operand_is_instr( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[OTHER:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %other = load i32, ptr %src + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + 
+define i32 @operand_is_arg(i32 %arg, i32 %other) { +; CHECK-LABEL: define i32 @operand_is_arg( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[OTHER:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = add i32 [[ARG]], [[OTHER]] +; CHECK-NEXT: [[RFL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]]) +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, %other + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) + ret i32 %rfl +} + +; test that convergence tokens are preserved + +define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg) convergent { +; CHECK-LABEL: define i32 @hoist_preserves_convergence_token( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG]]) [ "convergencectrl"(token [[ENTRY]]) ] +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[TMP0]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RFL]], %[[THEN]] ], [ [[ARG]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %entry = call token @llvm.experimental.convergence.entry() + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rfl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %val) [ "convergencectrl"(token %entry)] + br label %end + +end: + %res = phi i32 [%rfl, %then], [%arg, %bb] + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll new file mode 100644 index 0000000000000..a9ac4bc93fd3c --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.readlane.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s + +; The readfirstlane version of this test covers all the interesting cases of the +; shared logic. This testcase focuses on readlane specific pitfalls. + +; test unary + +define float @hoist_fneg_f32(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fneg_f32( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg float [[RL]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fneg float %arg + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +define double @hoist_fneg_f64(double %arg, i32 %lane) { +; CHECK-LABEL: define double @hoist_fneg_f64( +; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fneg double [[RL]] +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fneg double %arg + %rl = call double @llvm.amdgcn.readlane.f64(double %val, i32 %lane) + ret double %rl +} + +; test casts + +define i32 @hoist_trunc(i64 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_trunc( +; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[RL]] to i32 +; CHECK-NEXT: ret i32 [[TMP0]] +; +bb: + %val = trunc i64 %arg to i32 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define i64 @hoist_zext(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i64 @hoist_zext( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], 
i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[RL]] to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +bb: + %val = zext i32 %arg to i64 + %rl = call i64 @llvm.amdgcn.readlane.i64(i64 %val, i32 %lane) + ret i64 %rl +} + +; test binary i32 + +define i32 @hoist_add_i32(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_add_i32( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define float @hoist_fadd_f32(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fadd_f32( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd float [[RL]], 1.280000e+02 +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fadd float %arg, 128.0 + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +; test binary i64 + +define i64 @hoist_and_i64(i64 %arg, i32 %lane) { +; CHECK-LABEL: define i64 @hoist_and_i64( +; CHECK-SAME: i64 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i64 @llvm.amdgcn.readlane.i64(i64 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = and i64 [[RL]], 16777215 +; CHECK-NEXT: ret i64 [[RFL]] +; +bb: + %val = and i64 %arg, 16777215 + %rl = call i64 @llvm.amdgcn.readlane.i64(i64 %val, i32 %lane) + ret i64 %rl +} + +define double @hoist_fadd_f64(double %arg, i32 %lane) { +; CHECK-LABEL: define double @hoist_fadd_f64( +; CHECK-SAME: double [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call double @llvm.amdgcn.readlane.f64(double 
[[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fadd double [[RL]], 1.280000e+02 +; CHECK-NEXT: ret double [[RFL]] +; +bb: + %val = fadd double %arg, 128.0 + %rl = call double @llvm.amdgcn.readlane.f64(double %val, i32 %lane) + ret double %rl +} + +; Test hoisting readlane through binary operators when the constant is the LHS operand + +define i32 @hoist_sub_i32_lhs(i32 %arg, i32 %lane) { +; CHECK-LABEL: define i32 @hoist_sub_i32_lhs( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = sub i32 16777215, [[RL]] +; CHECK-NEXT: ret i32 [[RFL]] +; +bb: + %val = sub i32 16777215, %arg + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + ret i32 %rl +} + +define float @hoist_fsub_f32_lhs(float %arg, i32 %lane) { +; CHECK-LABEL: define float @hoist_fsub_f32_lhs( +; CHECK-SAME: float [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[RL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[RFL:%.*]] = fsub float 1.280000e+02, [[RL]] +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + +define i32 @readlane_lane_op_in_other_block(i1 %cond, i32 %arg, i32 %base) { +; CHECK-LABEL: define i32 @readlane_lane_op_in_other_block( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[LANE]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %lane = 
add i32 %base, 2 + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) + br label %end + +end: + %res = phi i32 [%rl, %then], [%lane, %bb] + ret i32 %res +} + +; Check cases where we can't move the readlane higher (the lane ID is defined after the readlane's source operand) + +define float @cannot_move_readlane(float %arg, i32 %base) { +; CHECK-LABEL: define float @cannot_move_readlane( +; CHECK-SAME: float [[ARG:%.*]], i32 [[BASE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = fsub float 1.280000e+02, [[ARG]] +; CHECK-NEXT: [[LANE:%.*]] = add i32 [[BASE]], 2 +; CHECK-NEXT: [[RFL:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE]]) +; CHECK-NEXT: ret float [[RFL]] +; +bb: + %val = fsub float 128.0, %arg + %lane = add i32 %base, 2 + %rl = call float @llvm.amdgcn.readlane.f32(float %val, i32 %lane) + ret float %rl +} + + +; Test that the "convergencectrl" operand bundle is kept on the hoisted readlane call + +define i32 @hoist_preserves_convergence_token(i1 %cond, i32 %arg, i32 %lane) convergent { +; CHECK-LABEL: define i32 @hoist_preserves_convergence_token( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[ARG:%.*]], i32 [[LANE:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[RL:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[LANE]]) [ "convergencectrl"(token [[ENTRY]]) ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[RL]], 16777215 +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +bb: + %entry = call token @llvm.experimental.convergence.entry() + br i1 %cond, label %then, label %end + +then: + %val = add i32 %arg, 16777215 + %rl = call i32 @llvm.amdgcn.readlane.i32(i32 %val, i32 %lane) [ "convergencectrl"(token %entry)] + br 
label %end + +end: + %res = phi i32 [%rl, %then], [%arg, %bb] + ret i32 %res +}