Skip to content

Commit 1b5ca05

Browse files
[AMDGPU] Hoist readlane/readfirstlane through unary/binary operands (llvm#129037) (llvm#3457)
When a read(first)lane is used on a binary operator and the intrinsic is the only user of the operator, we can move the read(first)lane into the operand if the other operand is uniform. Unfortunately, IC doesn't let us access UniformityAnalysis, so we can't truly check uniformity; we have to make do with a basic uniformity check which only allows constants or trivially uniform intrinsic calls. We can also do the same for unary and cast operators. Co-authored-by: Pierre van Houtryve <[email protected]>
1 parent 2b07f34 commit 1b5ca05

File tree

5 files changed

+1010
-1
lines changed

5 files changed

+1010
-1
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
465465

466466
bool useAA() const { return getST()->useAA(); }
467467

468-
bool isTypeLegal(Type *Ty) {
468+
/// Returns whether \p Ty is legal for the target: the type is first converted
/// to a value type through the target lowering (unknown types are allowed in
/// that conversion), and the resulting value type is then checked for
/// legality by the same target lowering object.
bool isTypeLegal(Type *Ty) const {
  return getTLI()->isTypeLegal(
      getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true));
}

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "AMDGPUTargetTransformInfo.h"
1919
#include "GCNSubtarget.h"
2020
#include "llvm/ADT/FloatingPointMode.h"
21+
#include "llvm/IR/Dominators.h"
2122
#include "llvm/IR/IntrinsicsAMDGPU.h"
2223
#include "llvm/Transforms/InstCombine/InstCombiner.h"
2324
#include <optional>
@@ -481,6 +482,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
481482
return false;
482483
}
483484

485+
/// Create a call to \p NewCallee with arguments \p Ops that is meant to stand
/// in for \p Old.
///
/// The operand bundles attached to \p Old are copied onto the new call, and
/// the new call takes over \p Old's name. \p Old is not erased or otherwise
/// rewired here; that is left to the caller.
///
/// \param B         Builder used to create the new call.
/// \param Old       Call being replaced (source of bundles and name).
/// \param NewCallee Function the new call targets.
/// \param Ops       Arguments passed to the new call.
/// \returns the newly created call.
static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
                             Function &NewCallee, ArrayRef<Value *> Ops) {
  // Carry every operand bundle of the old call over to the replacement.
  SmallVector<OperandBundleDef, 2> Bundles;
  Old.getOperandBundlesAsDefs(Bundles);

  CallInst *Replacement = B.CreateCall(&NewCallee, Ops, Bundles);
  Replacement->takeName(&Old);
  return Replacement;
}
494+
495+
/// Try to move a lane intrinsic (readlane, readfirstlane or permlane64) from
/// the result of an instruction onto one of that instruction's operands:
///
///   (II (OpInst x)) -> (OpInst (II x))
///
/// Supported producers are unary operators, casts whose source type is legal,
/// and binary operators where the operand that is *not* wrapped is trivially
/// uniform. For permlane64 only bitcasts are handled.
///
/// \param IC Combiner providing the IR builder and the dominator tree.
/// \param II The lane intrinsic call to hoist.
/// \returns a new, not yet inserted clone of the producer whose selected
/// operand is the rewritten intrinsic call, or nullptr if nothing was done.
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                             IntrinsicInst &II) const {
  const Intrinsic::ID IID = II.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_readlane ||
         IID == Intrinsic::amdgcn_readfirstlane ||
         IID == Intrinsic::amdgcn_permlane64);

  // The value being read must come from an instruction...
  auto *Producer = dyn_cast<Instruction>(II.getOperand(0));
  if (!Producer)
    return nullptr;

  // ...that has the intrinsic as its sole user...
  if (!Producer->hasOneUser())
    return nullptr;

  // ...and that lives in the same block as the intrinsic, so the exec mask
  // won't change between the two.
  if (Producer->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);

  // readlane carries an extra lane ID operand. When that operand is itself an
  // instruction, it has to dominate the producer; otherwise it would not be
  // usable at the point the intrinsic is being moved to. Non-instruction lane
  // IDs (e.g. constants) need no check.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(1);

    auto *LaneIDDef = dyn_cast<Instruction>(LaneID);
    if (LaneIDDef && !IC.getDominatorTree().dominates(LaneIDDef, Producer))
      return nullptr;
  }

  // Performs the actual rewrite: wraps operand `Idx` of the producer in a call
  // to `Callee`, then returns a clone of the producer consuming that call.
  const auto HoistThrough = [&](unsigned Idx,
                                Function *Callee) -> Instruction * {
    SmallVector<Value *, 2> Args;
    Args.push_back(Producer->getOperand(Idx));
    if (IsReadLane)
      Args.push_back(LaneID);

    // New intrinsic call applied directly to the producer's operand.
    CallInst *HoistedCall = rewriteCall(IC.Builder, II, *Callee, Args);

    // Copy of the producer that now reads the intrinsic's result instead.
    Instruction *Rebuilt = Producer->clone();
    Rebuilt->setOperand(Idx, HoistedCall);
    return Rebuilt;
  };

  // TODO(?): Should we do more with permlane64?
  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(Producer))
    return nullptr;

  // Unary operators keep the operand type, so the same callee can be reused.
  if (isa<UnaryOperator>(Producer))
    return HoistThrough(0, II.getCalledFunction());

  // Casts change the type, so the intrinsic must be re-declared for the cast's
  // source type; give up if that source type is not legal.
  if (isa<CastInst>(Producer)) {
    Type *SrcTy = Producer->getOperand(0)->getType();
    if (!isTypeLegal(SrcTy))
      return nullptr;

    Function *Remangled =
        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
    return HoistThrough(0, Remangled);
  }

  // Binary operators can be hoisted through when the other operand is
  // uniform.
  if (!isa<BinaryOperator>(Producer))
    return nullptr;

  // FIXME: If we had access to UniformityInfo here we could just check
  // if the operand is uniform.
  // Probe operand 0 first, then operand 1; the intrinsic goes on whichever
  // operand sits opposite the first trivially uniform one.
  for (unsigned UniformIdx = 0; UniformIdx != 2; ++UniformIdx) {
    if (isTriviallyUniform(Producer->getOperandUse(UniformIdx)))
      return HoistThrough(1 - UniformIdx, II.getCalledFunction());
  }

  return nullptr;
}
576+
484577
std::optional<Instruction *>
485578
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
486579
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1151,6 +1244,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
11511244
simplifyDemandedLaneMaskArg(IC, II, 1))
11521245
return &II;
11531246

1247+
if (IID != Intrinsic::amdgcn_ds_bpermute) {
1248+
if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1249+
return Res;
1250+
}
1251+
11541252
return std::nullopt;
11551253
}
11561254
case Intrinsic::amdgcn_writelane: {

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
227227

228228
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
229229
IntrinsicInst &II) const;
230+
231+
Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
232+
IntrinsicInst &II) const;
233+
230234
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
231235
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
232236
APInt &UndefElts2, APInt &UndefElts3,

0 commit comments

Comments
 (0)