Skip to content

Commit cc10c1a

Browse files
committed
[AMDGPU] Hoist readlane/readfirst through unary/binary operands
When a read(first)lane is used on a binary operator and the intrinsic is the only user of the operator, we can move the read(first)lane into the operand if the other operand is uniform. Unfortunately, IC doesn't give us access to UniformityAnalysis, so we can't truly check uniformity; we have to make do with a basic uniformity check which only allows constants or trivially uniform intrinsic calls. We can also do the same for simple unary operations.
1 parent 99cb3f7 commit cc10c1a

File tree

4 files changed

+666
-0
lines changed

4 files changed

+666
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,59 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
481481
return false;
482482
}
483483

484+
/// Try to hoist a readlane/readfirstlane intrinsic through the unary or binary
/// operator that produces its source value:
///
///   (II (Op x))          -> (Op (II x))
///   (II (Op x, uniform)) -> (Op (II x), uniform)
///
/// \param IC InstCombiner whose Builder is used to create the new intrinsic
///           call.
/// \param II The lane intrinsic call; operand 0 is the value being read and,
///           for amdgcn_readlane, operand 1 is the lane ID.
/// \returns A not-yet-inserted clone of the operator that consumes the new
///          intrinsic call, or nullptr if the transform does not apply.
Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
                                                     IntrinsicInst &II) const {
  Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));

  // Only do this if both instructions are in the same block
  // (so the exec mask won't change) and the readlane is the only user of its
  // operand.
  if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (II.getIntrinsicID() == Intrinsic::amdgcn_readlane);

  // readlane takes an extra lane ID operand, so check that the lane ID is
  // available at Op before moving the intrinsic higher.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(1);

    // Non-instruction lane IDs (constants, function arguments) are available
    // everywhere. A lane ID defined in a different block than II must, in
    // well-formed SSA, already dominate II's block and therefore Op. Only a
    // lane ID defined in Op's own block needs an ordering check;
    // Instruction::comesBefore asserts when the two instructions are in
    // different blocks, so it must not be called in the cross-block case.
    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
      if (LaneIDInst->getParent() == Op->getParent() &&
          !LaneIDInst->comesBefore(Op))
        return nullptr;
    }
  }

  // Rewrite (II (Op ..., x, ...)) into (Op ..., (II x), ...), where x is the
  // operand of Op at index OpIdx.
  const auto DoIt = [&](unsigned OpIdx) -> Instruction * {
    SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
    if (IsReadLane)
      Ops.push_back(LaneID);

    // NOTE(review): the new call is built from scratch, so any operand
    // bundles attached to II are not carried over - confirm this is intended.
    Instruction *NewII =
        IC.Builder.CreateIntrinsic(II.getType(), II.getIntrinsicID(), Ops);

    // Clone Op so that it consumes the result of the new intrinsic call. The
    // clone is returned without a parent block.
    Instruction &NewOp = *Op->clone();
    NewOp.setOperand(OpIdx, NewII);
    return &NewOp;
  };

  // TODO: Are any operations more expensive on the SALU than VALU, and thus
  //       need to be excluded here?

  if (isa<UnaryOperator>(Op))
    return DoIt(0);

  if (isa<BinaryOperator>(Op)) {
    // FIXME: If we had access to UniformityInfo here we could just check
    // if the operand is uniform.
    if (isTriviallyUniform(Op->getOperandUse(0)))
      return DoIt(1);
    if (isTriviallyUniform(Op->getOperandUse(1)))
      return DoIt(0);
  }

  return nullptr;
}
536+
484537
std::optional<Instruction *>
485538
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
486539
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1214,6 +1267,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
12141267
}
12151268
}
12161269

1270+
// If the readfirstlane reads the result of an operation that exists
1271+
// both in the SALU and VALU, we may be able to hoist it higher in order
1272+
// to scalarize the expression.
1273+
if (Instruction *Res = hoistReadLaneThroughOperand(IC, II))
1274+
return Res;
1275+
12171276
return std::nullopt;
12181277
}
12191278
case Intrinsic::amdgcn_writelane: {

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
236236
const APInt &DemandedElts,
237237
APInt &UndefElts) const;
238238

239+
/// Try to hoist the readlane/readfirstlane call \p II onto an operand of the
/// single-user unary or binary operator that feeds it. Returns the
/// replacement instruction, or nullptr if the transform does not apply.
Instruction *hoistReadLaneThroughOperand(InstCombiner &IC,
240+
IntrinsicInst &II) const;
241+
239242
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
240243
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
241244
APInt &UndefElts2, APInt &UndefElts3,

0 commit comments

Comments
 (0)