@@ -8527,9 +8527,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
85278527 {CanonicalIVIncrement, &Plan.getVectorTripCount ()}, DL);
85288528}
85298529
8530- // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8531- // original exit block.
8532- static void addUsersInExitBlock (
8530+ // Collect (ExitPhi, ExitingValue) pairs for phis in the original exit block that
8531+ // are modeled in VPlan. Some exiting values are not modeled explicitly yet and
8532+ // won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe,
8533+ // VPWidenPointerInductionRecipe and induction increments.
8534+ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
85338535 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
85348536 const MapVector<PHINode *, InductionDescriptor> &Inductions) {
85358537 auto MiddleVPBB =
@@ -8538,9 +8540,8 @@ static void addUsersInExitBlock(
85388540 // and there is nothing to fix from vector loop; phis should have incoming
85398541 // from scalar loop only.
85408542 if (MiddleVPBB->getNumSuccessors () != 2 )
8541- return ;
8542-
8543- // Introduce VPUsers modeling the exit values.
8543+ return {};
8544+ MapVector<PHINode *, VPValue *> ExitingValuesToFix;
85448545 BasicBlock *ExitBB =
85458546 cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
85468547 BasicBlock *ExitingBB = OrigLoop->getExitingBlock ();
@@ -8561,15 +8562,52 @@ static void addUsersInExitBlock(
85618562 return P && Inductions.contains (P);
85628563 })))
85638564 continue ;
8564- Plan. addLiveOut ( &ExitPhi, V);
8565+ ExitingValuesToFix. insert ({ &ExitPhi, V} );
85658566 }
8567+ return ExitingValuesToFix;
85668568}
85678569
8568- // / Feed a resume value for every FOR from the vector loop to the scalar loop,
8569- // / if middle block branches to scalar preheader, by introducing ExtractFromEnd
8570- // / and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8571- // / latter and corresponds to the scalar header.
8572- static void addLiveOutsForFirstOrderRecurrences (VPlan &Plan) {
8570+ // Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry
8571+ // in \p ExitingValuesToFix. Does nothing when \p ExitingValuesToFix is empty.
8572+ static void
8573+ addUsersInExitBlock (VPlan &Plan,
8574+ MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
8575+ if (ExitingValuesToFix.empty ())
8576+ return ; // No (ExitPhi, exiting value) pairs left to model.
8577+
8578+ auto MiddleVPBB = // The block that follows the vector loop region.
8579+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
8580+ BasicBlock *ExitBB = // IR block of successor 0; only used below to obtain the LLVMContext.
8581+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
8582+ // TODO: set B to MiddleVPBB->getFirstNonPhi(), taking care of affected tests.
8583+ VPBuilder B (MiddleVPBB);
8584+ if (auto *Terminator = MiddleVPBB->getTerminator ()) {
8585+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand (0 ));
8586+ assert ((!Condition || Condition->getParent () == MiddleVPBB) &&
8587+ " Condition expected in MiddleVPBB" );
8588+ B.setInsertPoint (Condition ? Condition : Terminator); // Insert at the condition if operand 0 is a VPInstruction, else at the terminator.
8589+ }
8590+
8591+ // Introduce VPUsers modeling the exit values: for each (ExitPhi, V) pair, build an ExtractFromEnd of V with an i32 constant 1 and register it as the live-out for ExitPhi.
8592+ for (const auto &[ExitPhi, V] : ExitingValuesToFix) {
8593+ VPValue *Ext = B.createNaryOp (
8594+ VPInstruction::ExtractFromEnd,
8595+ {V, Plan.getOrAddLiveIn (ConstantInt::get (
8596+ IntegerType::get (ExitBB->getContext (), 32 ), 1 ))});
8597+ Plan.addLiveOut (ExitPhi, Ext);
8598+ }
8599+ }
8600+
8601+ // / Handle live-outs for first-order recurrences, both in the scalar preheader
8602+ // / and the original exit block:
8603+ // / 1. Feed a resume value for every FOR from the vector loop to the scalar
8604+ // / loop, if middle block branches to scalar preheader, by introducing
8605+ // / ExtractFromEnd and ResumePhi recipes in each, respectively, and a
8606+ // / VPLiveOut which uses the latter and corresponds to the scalar header.
8607+ // / 2. Feed the penultimate value of recurrences to their LCSSA phi users in
8608+ // / the original exit block using a VPLiveOut.
8609+ static void addLiveOutsForFirstOrderRecurrences (
8610+ VPlan &Plan, MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
85738611 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion ();
85748612
85758613 // Start by finding out if middle block branches to scalar preheader, which is
@@ -8578,21 +8616,31 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
85788616 // TODO: Should be replaced by
85798617 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
85808618 // scalar region is modeled as well.
8581- VPBasicBlock *ScalarPHVPBB = nullptr ;
85828619 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor ());
8583- for (VPBlockBase *Succ : MiddleVPBB->getSuccessors ()) {
8584- if (isa<VPIRBasicBlock>(Succ))
8585- continue ;
8586- assert (!ScalarPHVPBB && " Two candidates for ScalarPHVPBB?" );
8587- ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8620+ BasicBlock *ExitBB = nullptr ;
8621+ VPBasicBlock *ScalarPHVPBB = nullptr ;
8622+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
8623+ // Order is strict: first is the exit block, second is the scalar preheader.
8624+ ExitBB =
8625+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
8626+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
8627+ } else if (ExitingValuesToFix.empty ()) {
8628+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
8629+ } else {
8630+ ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor ())
8631+ ->getIRBasicBlock ();
85888632 }
8589- if (!ScalarPHVPBB)
8633+ if (!ScalarPHVPBB) {
8634+ assert (ExitingValuesToFix.empty () &&
8635+ " missed inserting extracts for exiting values" );
85908636 return ;
8637+ }
85918638
85928639 VPBuilder ScalarPHBuilder (ScalarPHVPBB);
85938640 VPBuilder MiddleBuilder (MiddleVPBB);
85948641 // Reset insert point so new recipes are inserted before terminator and
85958642 // condition, if there is either the former or both.
8643+ // TODO: set MiddleBuilder to MiddleVPBB->getFirstNonPhi().
85968644 if (auto *Terminator = MiddleVPBB->getTerminator ()) {
85978645 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand (0 ));
85988646 assert ((!Condition || Condition->getParent () == MiddleVPBB) &&
@@ -8601,20 +8649,110 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
86018649 }
86028650 VPValue *OneVPV = Plan.getOrAddLiveIn (
86038651 ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
8652+ VPValue *TwoVPV = Plan.getOrAddLiveIn (
8653+ ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 2 ));
86048654
86058655 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock ()->phis ()) {
86068656 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
86078657 if (!FOR)
86088658 continue ;
86098659
8660+ // This is the second phase of vectorizing first-order recurrences, creating
8661+ // extract for users outside the loop. An overview of the transformation is
8662+ // described below. Suppose we have the following loop with some use after
8663+ // the loop of the last a[i-1],
8664+ //
8665+ // for (int i = 0; i < n; ++i) {
8666+ // t = a[i - 1];
8667+ // b[i] = a[i] - t;
8668+ // }
8669+ // use t;
8670+ //
8671+ // There is a first-order recurrence on "a". For this loop, the shorthand
8672+ // scalar IR looks like:
8673+ //
8674+ // scalar.ph:
8675+ // s.init = a[-1]
8676+ // br scalar.body
8677+ //
8678+ // scalar.body:
8679+ // i = phi [0, scalar.ph], [i+1, scalar.body]
8680+ // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
8681+ // s2 = a[i]
8682+ // b[i] = s2 - s1
8683+ // br cond, scalar.body, exit.block
8684+ //
8685+ // exit.block:
8686+ // use = lcssa.phi [s1, scalar.body]
8687+ //
8688+ // In this example, s1 is a recurrence because its value depends on the
8689+ // previous iteration. In the first phase of vectorization, we created a
8690+ // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
8691+ // for users in the scalar preheader and exit block.
8692+ //
8693+ // vector.ph:
8694+ // v_init = vector(..., ..., ..., a[-1])
8695+ // br vector.body
8696+ //
8697+ // vector.body
8698+ // i = phi [0, vector.ph], [i+4, vector.body]
8699+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
8700+ // v2 = a[i, i+1, i+2, i+3]
8701+ // b[i] = v2 - v1
8702+ // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
8703+ // b[i, i+1, i+2, i+3] = v2 - v1
8704+ // br cond, vector.body, middle.block
8705+ //
8706+ // middle.block:
8707+ // vector.recur.extract.for.phi = v2(2)
8708+ // vector.recur.extract = v2(3)
8709+ // br cond, scalar.ph, exit.block
8710+ //
8711+ // scalar.ph:
8712+ // scalar.recur.init = phi [vector.recur.extract, middle.block],
8713+ // [s.init, otherwise]
8714+ // br scalar.body
8715+ //
8716+ // scalar.body:
8717+ // i = phi [0, scalar.ph], [i+1, scalar.body]
8718+ // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
8719+ // s2 = a[i]
8720+ // b[i] = s2 - s1
8721+ // br cond, scalar.body, exit.block
8722+ //
8723+ // exit.block:
8724+ // lo = lcssa.phi [s1, scalar.body],
8725+ // [vector.recur.extract.for.phi, middle.block]
8726+ //
86108727 // Extract the resume value and create a new VPLiveOut for it.
86118728 auto *Resume = MiddleBuilder.createNaryOp (VPInstruction::ExtractFromEnd,
86128729 {FOR->getBackedgeValue (), OneVPV},
86138730 {}, " vector.recur.extract" );
86148731 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp (
86158732 VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
86168733 " scalar.recur.init" );
8617- Plan.addLiveOut (cast<PHINode>(FOR->getUnderlyingInstr ()), ResumePhiRecipe);
8734+ auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8735+ Plan.addLiveOut (FORPhi, ResumePhiRecipe);
8736+
8737+ // Now create VPLiveOuts for users in the exit block.
8738+ // Extract the penultimate value of the recurrence and add VPLiveOut
8739+ // users of the recurrence splice.
8740+
8741+ // No edge from the middle block to the unique exit block has been inserted
8742+ // and there is nothing to fix from vector loop; phis should have incoming
8743+ // from scalar loop only.
8744+ if (ExitingValuesToFix.empty ())
8745+ continue ;
8746+ for (User *U : FORPhi->users ()) {
8747+ auto *UI = cast<Instruction>(U);
8748+ if (UI->getParent () != ExitBB)
8749+ continue ;
8750+ VPValue *Ext = MiddleBuilder.createNaryOp (
8751+ VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue (), TwoVPV}, {},
8752+ " vector.recur.extract.for.phi" );
8753+ Plan.addLiveOut (cast<PHINode>(UI), Ext);
8754+ ExitingValuesToFix.erase (cast<PHINode>(UI));
8755+ }
86188756 }
86198757}
86208758
@@ -8769,16 +8907,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87698907 // After here, VPBB should not be used.
87708908 VPBB = nullptr ;
87718909
8772- addUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
8773- Legal->getInductionVars ());
8774-
87758910 assert (isa<VPRegionBlock>(Plan->getVectorLoopRegion ()) &&
87768911 !Plan->getVectorLoopRegion ()->getEntryBasicBlock ()->empty () &&
87778912 " entry block must be set to a VPRegionBlock having a non-empty entry "
87788913 " VPBasicBlock" );
87798914 RecipeBuilder.fixHeaderPhis ();
87808915
8781- addLiveOutsForFirstOrderRecurrences (*Plan);
8916+ MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
8917+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
8918+
8919+ addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
8920+ addUsersInExitBlock (*Plan, ExitingValuesToFix);
87828921
87838922 // ---------------------------------------------------------------------------
87848923 // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8931,6 +9070,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
89319070// iteration. The final value is selected by the final ComputeReductionResult.
89329071void LoopVectorizationPlanner::adjustRecipesForReductions (
89339072 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9073+ using namespace VPlanPatternMatch ;
89349074 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion ();
89359075 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock ();
89369076 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
@@ -8988,10 +9128,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
89889128 for (unsigned I = 0 ; I != Worklist.size (); ++I) {
89899129 VPSingleDefRecipe *Cur = Worklist[I];
89909130 for (VPUser *U : Cur->users ()) {
8991- auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8992- if (!UserRecipe) {
8993- assert (isa<VPLiveOut>(U) &&
8994- " U must either be a VPSingleDef or VPLiveOut" );
9131+ auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9132+ if (!UserRecipe->getParent ()->getEnclosingLoopRegion ()) {
9133+ assert (match (U, m_Binary<VPInstruction::ExtractFromEnd>(
9134+ m_VPValue (), m_VPValue ())) &&
9135+ " U must be an ExtractFromEnd VPInstruction" );
89959136 continue ;
89969137 }
89979138 Worklist.insert (UserRecipe);
@@ -9208,9 +9349,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
92089349 auto *FinalReductionResult = new VPInstruction (
92099350 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
92109351 FinalReductionResult->insertBefore (*MiddleVPBB, IP);
9211- OrigExitingVPV->replaceUsesWithIf (
9212- FinalReductionResult,
9213- [](VPUser &User, unsigned ) { return isa<VPLiveOut>(&User); });
9352+ OrigExitingVPV->replaceUsesWithIf (FinalReductionResult, [](VPUser &User,
9353+ unsigned ) {
9354+ return match (&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue (),
9355+ m_VPValue ()));
9356+ });
92149357 }
92159358
92169359 VPlanTransforms::clearReductionWrapFlags (*Plan);
0 commit comments