@@ -482,8 +482,14 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
-Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
-                                                     IntrinsicInst &II) const {
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
   Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));
 
   // Only do this if both instructions are in the same block
@@ -492,7 +498,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
   if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
     return nullptr;
 
-  const bool IsReadLane = (II.getIntrinsicID() == Intrinsic::amdgcn_readlane);
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+  const bool IsPermLane = (IID == Intrinsic::amdgcn_permlane64);
 
   // If this is a readlane, check that the second operand is a constant, or is
   // defined before Op so we know it's safe to move this intrinsic higher.
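
Note: the dominance check described above guards against hypothetical IR like the
following sketch (hand-written, not from this patch's tests), where hoisting the
readlane above %op would use the lane operand before it is defined; a constant
lane, or one defined before Op, cannot run into this:

  %op   = fadd float %x, %y
  %lane = call i32 @get_lane()   ; hypothetical helper, defined after %op
  %r    = call float @llvm.amdgcn.readlane.f32(float %op, i32 %lane)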
@@ -505,7 +512,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
     return nullptr;
   }
 
-  const auto DoIt = [&](unsigned OpIdx) -> Instruction * {
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
     SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
     if (IsReadLane)
       Ops.push_back(LaneID);
@@ -515,27 +523,40 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
     SmallVector<OperandBundleDef, 2> OpBundles;
     II.getOperandBundlesAsDefs(OpBundles);
 
-    CallInst *NewII =
-        IC.Builder.CreateCall(II.getCalledFunction(), Ops, OpBundles);
+    CallInst *NewII = IC.Builder.CreateCall(NewIntrinsic, Ops, OpBundles);
+    NewII->takeName(&II);
 
     Instruction &NewOp = *Op->clone();
     NewOp.setOperand(OpIdx, NewII);
     return &NewOp;
   };
 
-  // TODO: Are any operations more expensive on the SALU than VALU, and thus
-  // need to be excluded here?
-
   if (isa<UnaryOperator>(Op))
-    return DoIt(0);
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(Op)) {
+    Value *Src = Op->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
 
-  if (isa<BinaryOperator>(Op)) {
+  // Don't hoist through a binary operator for permlane64. It doesn't
+  // achieve anything and we'd need to repeat the call on every operand.
+  //
+  // We can do it for read(first)lane if other operands are already scalar
+  // because then we don't need to repeat the call.
+  if (!IsPermLane && isa<BinaryOperator>(Op)) {
     // FIXME: If we had access to UniformityInfo here we could just check
     // if the operand is uniform.
     if (isTriviallyUniform(Op->getOperandUse(0)))
-      return DoIt(1);
+      return DoIt(1, II.getCalledFunction());
     if (isTriviallyUniform(Op->getOperandUse(1)))
-      return DoIt(0);
+      return DoIt(0, II.getCalledFunction());
   }
 
   return nullptr;
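
As an illustration of the new CastInst branch: the intrinsic is remangled to the
source type (assuming that type is legal for the subtarget) and the cast is
cloned on top of it. A hand-written IR sketch, not taken from this patch's tests:

  %ext = zext i16 %v to i32
  %r   = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ext)
  ; becomes
  %r.lane = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %v)
  %r      = zext i16 %r.lane to i32

Likewise for the binary-operator branch, DoIt(1, ...) reads only the divergent
operand when the other is trivially uniform (again a hypothetical sketch):

  %sum = add i32 %uni, %div   ; %uni is trivially uniform
  %r   = call i32 @llvm.amdgcn.readlane.i32(i32 %sum, i32 5)
  ; becomes
  %r.lane = call i32 @llvm.amdgcn.readlane.i32(i32 %div, i32 5)
  %r      = add i32 %uni, %r.lane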
@@ -1233,31 +1254,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
-    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-    if (auto *BC = dyn_cast<BitCastInst>(Src);
-        BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
-      Value *BCSrc = BC->getOperand(0);
-
-      // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
-      if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        // Make sure convergence tokens are preserved.
-        // TODO: CreateIntrinsic should allow directly copying bundles
-        SmallVector<OperandBundleDef, 2> OpBundles;
-        II.getOperandBundlesAsDefs(OpBundles);
-
-        SmallVector<Value *, 3> Args(II.args());
-        Args[0] = BCSrc;
-
-        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
-      }
-    }
-
     // If the lane argument of bpermute is uniform, change it to readlane. This
     // generates better code and can enable further optimizations because
     // readlane is AlwaysUniform.
@@ -1274,13 +1270,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     }
 
-    // If the readfirstlane reads the result of an operation that exists
-    // both in the SALU and VALU, we may be able to hoist it higher in order
-    // to scalarize the expression.
-    if (IID != Intrinsic::amdgcn_permlane64) {
-      if (Instruction *Res = hoistReadLaneThroughOperand(IC, II))
-        return Res;
-    }
+    if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+      return Res;
 
     return std::nullopt;
   }
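
The dedicated bitcast combine deleted above is subsumed by the generic CastInst
branch of hoistLaneIntrinsicThroughOperand, e.g. (hypothetical IR sketch):

  %bc = bitcast float %x to i32
  %r  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bc)
  ; becomes
  %r.lane = call float @llvm.amdgcn.readfirstlane.f32(float %x)
  %r      = bitcast float %r.lane to i32

And since the permlane64 guard is gone, unary and cast operands of permlane64
now go through the same hoist.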