 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
@@ -503,6 +504,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }

+static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
+                             Function &NewCallee, ArrayRef<Value *> Ops) {
+  SmallVector<OperandBundleDef, 2> OpBundles;
+  Old.getOperandBundlesAsDefs(OpBundles);
+
+  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
+  NewCall->takeName(&Old);
+  return NewCall;
+}
+
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
+  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
+
+  // Only do this if both instructions are in the same block
+  // (so the exec mask won't change) and the readlane is the only user of its
+  // operand.
+  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
+    return nullptr;
+
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+
+  // If this is a readlane, check that the second operand is a constant, or is
+  // defined before OpInst so we know it's safe to move this intrinsic higher.
+  Value *LaneID = nullptr;
+  if (IsReadLane) {
+    LaneID = II.getOperand(1);
+
+    // readlane takes an extra operand for the lane ID, so we must check that
+    // the LaneID value is available at the point where we want to move the
+    // intrinsic.
+    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
+      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
+        return nullptr;
+    }
+  }
+
+  // Hoist the intrinsic (II) through OpInst.
+  //
+  // (II (OpInst x)) -> (OpInst (II x))
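+  //
+  // For example (an illustrative sketch, assuming f32/i32 are legal types):
+  //   %bc = bitcast float %x to i32
+  //   %r  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bc)
+  // becomes
+  //   %rf = call float @llvm.amdgcn.readfirstlane.f32(float %x)
+  //   %r  = bitcast float %rf to i32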
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
+    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
+    if (IsReadLane)
+      Ops.push_back(LaneID);
+
+    // Rewrite the intrinsic call.
+    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
+
+    // Rewrite OpInst so it takes the result of the intrinsic now.
+    Instruction &NewOp = *OpInst->clone();
+    NewOp.setOperand(OpIdx, NewII);
+    return &NewOp;
+  };
+
+  // TODO(?): Should we do more with permlane64?
+  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
+    return nullptr;
+
+  if (isa<UnaryOperator>(OpInst))
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(OpInst)) {
+    Value *Src = OpInst->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
+
+  // We can also hoist through binary operators if the other operand is uniform.
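+  // e.g. (illustrative) readfirstlane(add %x, %uniform)
+  //        -> add (readfirstlane %x), %uniform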
+  if (isa<BinaryOperator>(OpInst)) {
+    // FIXME: If we had access to UniformityInfo here we could just check
+    // if the operand is uniform.
+    if (isTriviallyUniform(OpInst->getOperandUse(0)))
+      return DoIt(1, II.getCalledFunction());
+    if (isTriviallyUniform(OpInst->getOperandUse(1)))
+      return DoIt(0, II.getCalledFunction());
+  }
+
+  return nullptr;
+}
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1264,31 +1357,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;

-    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-    if (auto *BC = dyn_cast<BitCastInst>(Src);
-        BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
-      Value *BCSrc = BC->getOperand(0);
-
-      // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
-      if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        // Make sure convergence tokens are preserved.
-        // TODO: CreateIntrinsic should allow directly copying bundles
-        SmallVector<OperandBundleDef, 2> OpBundles;
-        II.getOperandBundlesAsDefs(OpBundles);
-
-        SmallVector<Value *, 3> Args(II.args());
-        Args[0] = BCSrc;
-
-        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
-      }
-    }
-
     // If the lane argument of bpermute is uniform, change it to readlane. This
     // generates better code and can enable further optimizations because
     // readlane is AlwaysUniform.
@@ -1305,6 +1373,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     }

+    if (IID != Intrinsic::amdgcn_ds_bpermute) {
+      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+        return Res;
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {