@@ -543,6 +543,11 @@ class InnerLoopVectorizer {
543543protected:
544544 friend class LoopVectorizationPlanner ;
545545
546+ /// Set up the values of the IVs correctly when exiting the vector loop.
/// \p OrigPhi is the induction phi of the original loop and \p II its
/// induction descriptor. \p MiddleBlock is the block for which incoming
/// values are added to phis outside the loop that use the IV; \p State is
/// used to look up the expanded induction step. Virtual so that derived
/// vectorizers can override the fix-up.
547+ virtual void fixupIVUsers (PHINode *OrigPhi, const InductionDescriptor &II,
548+ Value *VectorTripCount, BasicBlock *MiddleBlock,
549+ VPTransformState &State);
550+
546551 /// Iteratively sink the scalarized operands of a predicated instruction into
547552 /// the block that was created for it.
548553 void sinkScalarOperands (Instruction *PredInst);
@@ -780,6 +785,10 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
780785 BasicBlock *emitIterationCountCheck (BasicBlock *Bypass, bool ForEpilogue);
781786 void printDebugTracesAtStart () override ;
782787 void printDebugTracesAtEnd () override ;
788+
/// No-op override: the body is empty, so no phis outside the loop are updated
/// for this vectorizer.
/// NOTE(review): presumably the IV users are fixed up when the epilogue loop
/// is generated - confirm against the epilogue vectorizer. The trailing ';'
/// after the empty body is redundant.
789+ void fixupIVUsers (PHINode *OrigPhi, const InductionDescriptor &II,
790+ Value *VectorTripCount, BasicBlock *MiddleBlock,
791+ VPTransformState &State) override {};
783792};
784793
785794// A specialized derived class of inner loop vectorizer that performs
@@ -2773,6 +2782,97 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
27732782 return LoopVectorPreHeader;
27742783}
27752784
2785+ // Fix up external users of the induction variable. At this point, we are
2786+ // in LCSSA form, with all external PHIs that use the IV having one input value,
2787+ // coming from the remainder loop. We need those PHIs to also have a correct
2788+ // value for the IV when arriving directly from the middle block.
//
// \p OrigPhi is the induction phi of the original loop (it has incoming
// values from the loop preheader and the loop latch) and \p II its induction
// descriptor, which supplies the step and the induction binary operator.
// \p MiddleBlock is the block the new incoming values are attached to; the
// escape computations are emitted right before its terminator. \p State is
// used to look up the expanded step when it is not a live-in.
// \p VectorTripCount is not read by this implementation.
2789+ void InnerLoopVectorizer::fixupIVUsers (PHINode *OrigPhi,
2790+ const InductionDescriptor &II,
2791+ Value *VectorTripCount,
2792+ BasicBlock *MiddleBlock,
2793+ VPTransformState &State) {
2794+ // There are two kinds of external IV usages - those that use the value
2795+ // computed in the last iteration (the increment that feeds into the phi
2796+ // from the loop latch) and those that use the penultimate value (the phi
2797+ // itself). We allow both, but they, obviously, have different values.
2798+
// Maps each phi outside the loop to the value it must receive when control
// arrives from MiddleBlock.
2799+ DenseMap<Value *, Value *> MissingVals;
2800+
// The preheader incoming value of OrigPhi is itself a phi (the cast asserts
// this); its incoming value for MiddleBlock is the end value of the IV.
2801+ Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock (
2802+ OrigLoop->getLoopPreheader ()))
2803+ ->getIncomingValueForBlock (MiddleBlock);
2804+
2805+ // An external user of the last iteration's value should see the value that
2806+ // the remainder loop uses to initialize its own IV.
2807+ Value *PostInc = OrigPhi->getIncomingValueForBlock (OrigLoop->getLoopLatch ());
2808+ for (User *U : PostInc->users ()) {
2809+ Instruction *UI = cast<Instruction>(U);
2810+ if (!OrigLoop->contains (UI)) {
2811+ assert (isa<PHINode>(UI) && " Expected LCSSA form" );
2812+ MissingVals[UI] = EndValue;
2813+ }
2814+ }
2815+
2816+ // An external user of the penultimate value needs to see EndValue - Step.
2817+ // It is computed in the middle block by undoing one induction step from
2818+ // EndValue, using the expanded step of the induction descriptor.
2819+ for (User *U : OrigPhi->users ()) {
2820+ auto *UI = cast<Instruction>(U);
2821+ if (!OrigLoop->contains (UI)) {
2822+ assert (isa<PHINode>(UI) && " Expected LCSSA form" );
// A separate escape value is emitted for every external user, inserted
// before the terminator of the middle block.
2823+ IRBuilder<> B (MiddleBlock->getTerminator ());
2824+
2825+ // Fast-math-flags propagate from the original induction instruction.
2826+ if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp ()))
2827+ B.setFastMathFlags (II.getInductionBinOp ()->getFastMathFlags ());
2828+
// Fetch the step through the plan's SCEV expansion: a live-in is used
// directly, otherwise lane 0 of the generated value is read from State.
2829+ VPValue *StepVPV = Plan.getSCEVExpansion (II.getStep ());
2830+ assert (StepVPV && " step must have been expanded during VPlan execution" );
2831+ Value *Step = StepVPV->isLiveIn () ? StepVPV->getLiveInIRValue ()
2832+ : State.get (StepVPV, VPLane (0 ));
// Undo one step according to the induction type: integer subtraction,
// pointer add of the negated step, or for floating point the inverse of
// the induction binary operator (FSub for FAdd, FAdd otherwise).
2833+ Value *Escape = nullptr ;
2834+ if (EndValue->getType ()->isIntegerTy ())
2835+ Escape = B.CreateSub (EndValue, Step);
2836+ else if (EndValue->getType ()->isPointerTy ())
2837+ Escape = B.CreatePtrAdd (EndValue, B.CreateNeg (Step));
2838+ else {
2839+ assert (EndValue->getType ()->isFloatingPointTy () &&
2840+ " Unexpected induction type" );
2841+ Escape = B.CreateBinOp (II.getInductionBinOp ()->getOpcode () ==
2842+ Instruction::FAdd
2843+ ? Instruction::FSub
2844+ : Instruction::FAdd,
2845+ EndValue, Step);
2846+ }
2847+ Escape->setName (" ind.escape" );
// If this phi was already recorded by the loop over PostInc users, the
// entry is overwritten here.
2848+ MissingVals[UI] = Escape;
2849+ }
2850+ }
2851+
// Sanity check: every block holding a phi to fix may only be reached from
// the middle block or from the latch of the original loop.
2852+ assert ((MissingVals.empty () ||
2853+ all_of (MissingVals,
2854+ [MiddleBlock, this ](const std::pair<Value *, Value *> &P) {
2855+ return all_of (
2856+ predecessors (cast<Instruction>(P.first )->getParent ()),
2857+ [MiddleBlock, this ](BasicBlock *Pred) {
2858+ return Pred == MiddleBlock ||
2859+ Pred == OrigLoop->getLoopLatch ();
2860+ });
2861+ })) &&
2862+ " Expected escaping values from latch/middle.block only" );
2863+
// Finally attach the recorded values as incoming values for MiddleBlock.
2864+ for (auto &I : MissingVals) {
2865+ PHINode *PHI = cast<PHINode>(I.first );
2866+ // One corner case we have to handle is two IVs "chasing" each-other,
2867+ // that is %IV2 = phi [...], [ %IV1, %latch ]
2868+ // In this case, if IV1 has an external use, we need to avoid adding both
2869+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
2870+ // don't already have an incoming value for the middle block.
2871+ if (PHI->getBasicBlockIndex (MiddleBlock) == -1 )
2872+ PHI->addIncoming (I.second , MiddleBlock);
2873+ }
2874+ }
2875+
27762876namespace {
27772877
27782878struct CSEDenseMapInfo {
@@ -2899,6 +2999,24 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
28992999 for (PHINode &PN : Exit->phis ())
29003000 PSE.getSE ()->forgetLcssaPhiWithNewPredecessor (OrigLoop, &PN);
29013001
3002+ if (Cost->requiresScalarEpilogue (VF.isVector ())) {
3003+ // No edge from the middle block to the unique exit block has been inserted
3004+ // and there is nothing to fix from vector loop; phis should have incoming
3005+ // from scalar loop only.
3006+ } else {
3007+ // TODO: Check in VPlan to see if IV users need fixing instead of checking
3008+ // the cost model.
3009+
3010+ // If we inserted an edge from the middle block to the unique exit block,
3011+ // update uses outside the loop (phis) to account for the newly inserted
3012+ // edge.
3013+
3014+ // Fix-up external users of the induction variables.
3015+ for (const auto &Entry : Legal->getInductionVars ())
3016+ fixupIVUsers (Entry.first , Entry.second ,
3017+ getOrCreateVectorTripCount (nullptr ), LoopMiddleBlock, State);
3018+ }
3019+
29023020 // Don't apply optimizations below when no vector region remains, as they all
29033021 // require a vector loop at the moment.
29043022 if (!State.Plan ->getVectorLoopRegion ())
@@ -8931,9 +9049,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
89319049/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
89329050/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
89339051/// the end value of the induction.
8934- static VPInstruction *addResumePhiRecipeForInduction (
8935- VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8936- VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
9052+ static VPValue *addResumePhiRecipeForInduction (VPWidenInductionRecipe *WideIV,
9053+ VPBuilder &VectorPHBuilder,
9054+ VPBuilder &ScalarPHBuilder,
9055+ VPTypeAnalysis &TypeInfo,
9056+ VPValue *VectorTC) {
89379057 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
89389058 // Truncated wide inductions resume from the last lane of their vector value
89399059 // in the last vector iteration which is handled elsewhere.
@@ -8967,10 +9087,8 @@ static VPInstruction *addResumePhiRecipeForInduction(
89679087
89689088// / Create resume phis in the scalar preheader for first-order recurrences,
89699089// / reductions and inductions, and update the VPIRInstructions wrapping the
8970- // / original phis in the scalar header. End values for inductions are added to
8971- // / \p IVEndValues.
8972- static void addScalarResumePhis (VPRecipeBuilder &Builder, VPlan &Plan,
8973- DenseMap<VPValue *, VPValue *> &IVEndValues) {
9090+ // / original phis in the scalar header.
9091+ static void addScalarResumePhis (VPRecipeBuilder &Builder, VPlan &Plan) {
89749092 VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
89759093 auto *ScalarPH = Plan.getScalarPreheader ();
89769094 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor ());
@@ -8987,16 +9105,11 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
89879105 if (!ScalarPhiI)
89889106 break ;
89899107
8990- // TODO: Extract final value from induction recipe initially, optimize to
8991- // pre-computed end value together in optimizeInductionExitUsers.
89929108 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe (ScalarPhiI));
89939109 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
8994- if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction (
9110+ if (VPValue *ResumePhi = addResumePhiRecipeForInduction (
89959111 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
89969112 &Plan.getVectorTripCount ())) {
8997- assert (ResumePhi->getOpcode () == VPInstruction::ResumePhi &&
8998- " Expected a ResumePhi" );
8999- IVEndValues[WideIVR] = ResumePhi->getOperand (0 );
90009113 ScalarPhiIRI->addOperand (ResumePhi);
90019114 continue ;
90029115 }
@@ -9027,6 +9140,65 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
90279140 }
90289141}
90299142
9143+ /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9144+ /// either an untruncated wide induction, or if it increments a wide induction
9145+ /// by its step. Returns false for values without a defining recipe.
9146+ static bool isOptimizableIVOrUse (VPValue *VPV) {
9147+ VPRecipeBase *Def = VPV->getDefiningRecipe ();
9148+ if (!Def)
9149+ return false ;
9150+ auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9151+ if (WideIV) {
9152+ // VPV itself is a wide induction, separately compute the end value for exit
9153+ // users if it is not a truncated IV.
// Pointer inductions are always accepted; int/FP inductions only when
// they carry no truncate instruction.
9154+ return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9155+ !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst ();
9156+ }
9157+
9158+ // Check if VPV is an optimizable induction increment.
// An increment must be a two-operand recipe with a wide induction as one
// of its operands; operand 0 is tried first, then operand 1.
9159+ if (Def->getNumOperands () != 2 )
9160+ return false ;
9161+ WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand (0 ));
9162+ if (!WideIV)
9163+ WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand (1 ));
9164+ if (!WideIV)
9165+ return false ;
9166+
9167+ using namespace VPlanPatternMatch ;
9168+ auto &ID = WideIV->getInductionDescriptor ();
9169+
9170+ // Check if VPV increments the induction by the induction step.
9171+ VPValue *IVStep = WideIV->getStepValue ();
9172+ switch (ID.getInductionOpcode ()) {
// Add and FAdd use the m_c_Binary matcher; FSub below uses m_Binary with
// the wide induction as first and the step as second operand.
9173+ case Instruction::Add:
9174+ return match (VPV, m_c_Binary<Instruction::Add>(m_Specific (WideIV),
9175+ m_Specific (IVStep)));
9176+ case Instruction::FAdd:
9177+ return match (VPV, m_c_Binary<Instruction::FAdd>(m_Specific (WideIV),
9178+ m_Specific (IVStep)));
9179+ case Instruction::FSub:
9180+ return match (VPV, m_Binary<Instruction::FSub>(m_Specific (WideIV),
9181+ m_Specific (IVStep)));
9182+ case Instruction::Sub: {
9183+ // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9184+ // IVStep.
// The first operand is matched with a wildcard; WideIV is already known
// to be one of the two operands of Def. Both steps must be live-ins
// holding ConstantInt values for the comparison to succeed.
9185+ VPValue *Step;
9186+ if (!match (VPV, m_Binary<Instruction::Sub>(m_VPValue (), m_VPValue (Step))) ||
9187+ !Step->isLiveIn () || !IVStep->isLiveIn ())
9188+ return false ;
9189+ auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue ());
9190+ auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue ());
9191+ return StepCI && IVStepCI &&
9192+ StepCI->getValue () == (-1 * IVStepCI->getValue ());
9193+ }
9194+ default :
// Any other opcode is accepted only for pointer inductions that are
// advanced by a GEP of the wide induction by its step.
9195+ return ID.getKind () == InductionDescriptor::IK_PtrInduction &&
9196+ match (VPV, m_GetElementPtr (m_Specific (WideIV),
9197+ m_Specific (WideIV->getStepValue ())));
9198+ }
// Every path through the switch returns, so this point is never reached.
9199+ llvm_unreachable (" should have been covered by switch above" );
9200+ }
9201+
90309202// Collect VPIRInstructions for phis in the exit blocks that are modeled
90319203// in VPlan and add the exiting VPValue as operand. Some exiting values are not
90329204// modeled explicitly yet and won't be included. Those are un-truncated
@@ -9056,6 +9228,12 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
90569228 }
90579229 Value *IncomingValue = ExitPhi->getIncomingValueForBlock (ExitingBB);
90589230 VPValue *V = Builder.getVPValueOrAddLiveIn (IncomingValue);
9231+ // Exit values for inductions are computed and updated outside of VPlan
9232+ // and independent of induction recipes.
9233+ // TODO: Compute induction exit values in VPlan.
9234+ if (isOptimizableIVOrUse (V) &&
9235+ ExitVPBB->getSinglePredecessor () == MiddleVPBB)
9236+ continue ;
90599237 ExitUsersToFix.insert (ExitIRI);
90609238 ExitIRI->addOperand (V);
90619239 }
@@ -9075,7 +9253,6 @@ addUsersInExitBlocks(VPlan &Plan,
90759253
90769254 auto *MiddleVPBB = Plan.getMiddleBlock ();
90779255 VPBuilder B (MiddleVPBB, MiddleVPBB->getFirstNonPhi ());
9078- VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
90799256
90809257 // Introduce extract for exiting values and update the VPIRInstructions
90819258 // modeling the corresponding LCSSA phis.
@@ -9397,8 +9574,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93979574 VPlanTransforms::handleUncountableEarlyExit (
93989575 *Plan, *PSE.getSE (), OrigLoop, UncountableExitingBlock, RecipeBuilder);
93999576 }
9400- DenseMap<VPValue *, VPValue *> IVEndValues;
9401- addScalarResumePhis (RecipeBuilder, *Plan, IVEndValues);
9577+ addScalarResumePhis (RecipeBuilder, *Plan);
94029578 SetVector<VPIRInstruction *> ExitUsersToFix =
94039579 collectUsersInExitBlocks (OrigLoop, RecipeBuilder, *Plan);
94049580 addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
@@ -9481,7 +9657,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
94819657 VPlanTransforms::addActiveLaneMask (*Plan, ForControlFlow,
94829658 WithoutRuntimeCheck);
94839659 }
9484- VPlanTransforms::optimizeInductionExitUsers (*Plan, IVEndValues);
94859660
94869661 assert (verifyVPlanIsValid (*Plan) && " VPlan is invalid" );
94879662 return Plan;
@@ -9533,10 +9708,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
95339708 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
95349709 RecipeBuilder.setRecipe (HeaderR->getUnderlyingInstr (), HeaderR);
95359710 }
9536- DenseMap<VPValue *, VPValue *> IVEndValues;
9537- // TODO: IVEndValues are not used yet in the native path, to optimize exit
9538- // values.
9539- addScalarResumePhis (RecipeBuilder, *Plan, IVEndValues);
9711+ addScalarResumePhis (RecipeBuilder, *Plan);
95409712
95419713 assert (verifyVPlanIsValid (*Plan) && " VPlan is invalid" );
95429714 return Plan;
0 commit comments