@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467467 ElementCount MinProfitableTripCount,
468468 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469469 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471+ VPlan &Plan)
471472 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472473 AC (AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473474 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474- PSI(PSI), RTChecks(RTChecks) {
475+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475476 // Query this against the original loop and save it here because the profile
476477 // of the original loop header may change as the transformation happens.
477478 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize (
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522523 // / and the resume values can come from an additional bypass block, the \p
523524 // / AdditionalBypass pair provides information about the bypass block and the
524525 // / end value on the edge from bypass to this loop.
525- PHINode * createInductionResumeValue (
526+ void createInductionResumeValue (
526527 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527528 ArrayRef<BasicBlock *> BypassBlocks,
528529 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535536 // / count of the original loop for both main loop and epilogue vectorization.
536537 void setTripCount (Value *TC) { TripCount = TC; }
537538
539+ std::pair<BasicBlock *, Value *>
540+ getInductionBypassValue (PHINode *OrigPhi) const {
541+ return InductionBypassValues.find (OrigPhi)->second ;
542+ }
543+
538544protected:
539545 friend class LoopVectorizationPlanner ;
540546
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
680686 // / Structure to hold information about generated runtime checks, responsible
681687 // / for cleaning the checks, if vectorization turns out unprofitable.
682688 GeneratedRTChecks &RTChecks;
689+
690+ // / Mapping of induction phis to their bypass values and bypass blocks.
691+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
692+
693+ VPlan &Plan;
683694};
684695
685696// / Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
721732 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
722733 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
723734 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
724- GeneratedRTChecks &Checks)
735+ GeneratedRTChecks &Checks, VPlan &Plan )
725736 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
726737 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
727- CM, BFI, PSI, Checks),
738+ CM, BFI, PSI, Checks, Plan ),
728739 EPI (EPI) {}
729740
730741 // Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
761772 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
762773 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
763774 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
764- GeneratedRTChecks &Check)
775+ GeneratedRTChecks &Check, VPlan &Plan )
765776 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
766- EPI, LVL, CM, BFI, PSI, Check) {}
777+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
767778 // / Implements the interface for creating a vectorized skeleton using the
768779 // / *main loop* strategy (ie the first pass of vplan execution).
769780 std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
790801 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
791802 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
792803 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
793- GeneratedRTChecks &Checks)
804+ GeneratedRTChecks &Checks, VPlan &Plan )
794805 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
795- EPI, LVL, CM, BFI, PSI, Checks) {
806+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
796807 TripCount = EPI.TripCount ;
797808 }
798809 // / Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
25552566 nullptr , Twine (Prefix) + " scalar.ph" );
25562567}
25572568
2558- PHINode *InnerLoopVectorizer::createInductionResumeValue (
2569+ static void addOperandToPhiInVPIRBasicBlock (VPIRBasicBlock *VPBB, PHINode *P,
2570+ VPValue *Op) {
2571+ for (VPRecipeBase &R : *VPBB) {
2572+ auto *IRI = cast<VPIRInstruction>(&R);
2573+ if (&IRI->getInstruction () == P) {
2574+ IRI->addOperand (Op);
2575+ break ;
2576+ }
2577+ }
2578+ }
2579+
2580+ void InnerLoopVectorizer::createInductionResumeValue (
25592581 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
25602582 ArrayRef<BasicBlock *> BypassBlocks,
25612583 std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
25902612 }
25912613 }
25922614
2593- // Create phi nodes to merge from the backedge-taken check block.
2594- PHINode *BCResumeVal =
2595- PHINode::Create (OrigPhi->getType (), 3 , " bc.resume.val" ,
2596- LoopScalarPreHeader->getFirstNonPHIIt ());
2597- // Copy original phi DL over to the new one.
2598- BCResumeVal->setDebugLoc (OrigPhi->getDebugLoc ());
2615+ VPBasicBlock *MiddleVPBB =
2616+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
25992617
2600- // The new PHI merges the original incoming value, in case of a bypass,
2601- // or the value at the end of the vectorized loop.
2602- BCResumeVal->addIncoming (EndValue, LoopMiddleBlock);
2618+ VPBasicBlock *ScalarPHVPBB = nullptr ;
2619+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
2620+ // Order is strict: first is the exit block, second is the scalar preheader.
2621+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
2622+ } else {
2623+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
2624+ }
26032625
2604- // Fix the scalar body counter (PHI node).
2605- // The old induction's phi node in the scalar body needs the truncated
2606- // value.
2607- for (BasicBlock *BB : BypassBlocks)
2608- BCResumeVal-> addIncoming (II. getStartValue ( ), BB );
2626+ VPBuilder ScalarPHBuilder (ScalarPHVPBB);
2627+ auto *ResumePhiRecipe = ScalarPHBuilder. createNaryOp (
2628+ VPInstruction::ResumePhi,
2629+ {Plan. getOrAddLiveIn (EndValue), Plan. getOrAddLiveIn (II. getStartValue ())},
2630+ OrigPhi-> getDebugLoc ( ), " bc.resume.val " );
26092631
2610- if (AdditionalBypass.first )
2611- BCResumeVal->setIncomingValueForBlock (AdditionalBypass.first ,
2612- EndValueFromAdditionalBypass);
2613- return BCResumeVal;
2632+ auto *ScalarLoopHeader =
2633+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ());
2634+ addOperandToPhiInVPIRBasicBlock (ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2635+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first ,
2636+ EndValueFromAdditionalBypass};
26142637}
26152638
26162639// / Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
26432666 for (const auto &InductionEntry : Legal->getInductionVars ()) {
26442667 PHINode *OrigPhi = InductionEntry.first ;
26452668 const InductionDescriptor &II = InductionEntry.second ;
2646- PHINode *BCResumeVal = createInductionResumeValue (
2647- OrigPhi, II, getExpandedStep (II, ExpandedSCEVs), LoopBypassBlocks,
2648- AdditionalBypass);
2649- OrigPhi->setIncomingValueForBlock (LoopScalarPreHeader, BCResumeVal);
2669+ createInductionResumeValue (OrigPhi, II, getExpandedStep (II, ExpandedSCEVs),
2670+ LoopBypassBlocks, AdditionalBypass);
26502671 }
26512672}
26522673
@@ -7688,6 +7709,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
76887709 // the second pass for the scalar loop. The induction resume values for the
76897710 // inductions in the epilogue loop are created before executing the plan for
76907711 // the epilogue loop.
7712+ for (VPRecipeBase &R :
7713+ Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
7714+ // Create induction resume values for both widened pointer and
7715+ // integer/fp inductions and update the start value of the induction
7716+ // recipes to use the resume value.
7717+ PHINode *IndPhi = nullptr ;
7718+ const InductionDescriptor *ID;
7719+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7720+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
7721+ ID = &Ind->getInductionDescriptor ();
7722+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7723+ IndPhi = WidenInd->getPHINode ();
7724+ ID = &WidenInd->getInductionDescriptor ();
7725+ } else
7726+ continue ;
7727+
7728+ createInductionResumeValue (IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
7729+ LoopBypassBlocks);
7730+ }
76917731
76927732 return {LoopVectorPreHeader, nullptr };
76937733}
@@ -8865,14 +8905,9 @@ static void addLiveOutsForFirstOrderRecurrences(
88658905 VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
88668906 " scalar.recur.init" );
88678907 auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8868- for (VPRecipeBase &R :
8869- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ())) {
8870- auto *IRI = cast<VPIRInstruction>(&R);
8871- if (&IRI->getInstruction () == FORPhi) {
8872- IRI->addOperand (ResumePhiRecipe);
8873- break ;
8874- }
8875- }
8908+ addOperandToPhiInVPIRBasicBlock (
8909+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ()), FORPhi,
8910+ ResumePhiRecipe);
88768911
88778912 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
88788913 // Extract the penultimate value of the recurrence and use it as operand for
@@ -9599,7 +9634,7 @@ static bool processLoopInVPlanNativePath(
95999634 GeneratedRTChecks Checks (*PSE.getSE (), DT, LI, TTI,
96009635 F->getDataLayout (), AddBranchWeights);
96019636 InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9602- VF.Width , 1 , LVL, &CM, BFI, PSI, Checks);
9637+ VF.Width , 1 , LVL, &CM, BFI, PSI, Checks, BestPlan );
96039638 LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
96049639 << L->getHeader ()->getParent ()->getName () << " \"\n " );
96059640 LVP.executePlan (VF.Width , 1 , BestPlan, LB, DT, false );
@@ -10087,11 +10122,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1008710122 assert (IC > 1 && " interleave count should not be 1 or 0" );
1008810123 // If we decided that it is not legal to vectorize the loop, then
1008910124 // interleave it.
10125+ VPlan &BestPlan = LVP.getPlanFor (VF.Width );
1009010126 InnerLoopVectorizer Unroller (
1009110127 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed (1 ),
10092- ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks);
10128+ ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
1009310129
10094- VPlan &BestPlan = LVP.getPlanFor (VF.Width );
1009510130 LVP.executePlan (VF.Width , IC, BestPlan, Unroller, DT, false );
1009610131
1009710132 ORE->emit ([&]() {
@@ -10113,10 +10148,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1011310148 // to be vectorized by executing the plan (potentially with a different
1011410149 // factor) again shortly afterwards.
1011510150 EpilogueLoopVectorizationInfo EPI (VF.Width , IC, EpilogueVF.Width , 1 );
10151+ std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
1011610152 EpilogueVectorizerMainLoop MainILV (L, PSE, LI, DT, TLI, TTI, AC, ORE,
10117- EPI, &LVL, &CM, BFI, PSI, Checks);
10153+ EPI, &LVL, &CM, BFI, PSI, Checks,
10154+ *BestMainPlan);
1011810155
10119- std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
1012010156 auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
1012110157 *BestMainPlan, MainILV, DT, true );
1012210158 ++LoopsVectorized;
@@ -10125,11 +10161,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1012510161 // edges from the first pass.
1012610162 EPI.MainLoopVF = EPI.EpilogueVF ;
1012710163 EPI.MainLoopUF = EPI.EpilogueUF ;
10164+ VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
1012810165 EpilogueVectorizerEpilogueLoop EpilogILV (L, PSE, LI, DT, TLI, TTI, AC,
1012910166 ORE, EPI, &LVL, &CM, BFI, PSI,
10130- Checks);
10167+ Checks, BestEpiPlan );
1013110168
10132- VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
1013310169 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion ();
1013410170 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
1013510171 Header->setName (" vec.epilog.vector.body" );
@@ -10178,23 +10214,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1017810214 RdxDesc.getRecurrenceStartValue ());
1017910215 }
1018010216 } else {
10181- // Create induction resume values for both widened pointer and
10182- // integer/fp inductions and update the start value of the induction
10183- // recipes to use the resume value.
10217+ // Retrieve the induction resume values for wide inductions from
10218+ // their original phi nodes in the scalar loop.
1018410219 PHINode *IndPhi = nullptr ;
10185- const InductionDescriptor *ID;
1018610220 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
1018710221 IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
10188- ID = &Ind->getInductionDescriptor ();
1018910222 } else {
1019010223 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
1019110224 IndPhi = WidenInd->getPHINode ();
10192- ID = &WidenInd->getInductionDescriptor ();
1019310225 }
10194-
10195- ResumeV = MainILV.createInductionResumeValue (
10196- IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
10197- {EPI.MainLoopIterationCountCheck });
10226+ ResumeV = IndPhi->getIncomingValueForBlock (L->getLoopPreheader ());
1019810227 }
1019910228 assert (ResumeV && " Must have a resume value" );
1020010229 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn (ResumeV);
@@ -10206,13 +10235,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1020610235 LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
1020710236 DT, true , &ExpandedSCEVs);
1020810237 ++LoopsEpilogueVectorized;
10238+ BasicBlock *PH = L->getLoopPreheader ();
1020910239
10240+ for (const auto &[IVPhi, _] : LVL.getInductionVars ()) {
10241+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock (PH));
10242+ const auto &[BB, V] = EpilogILV.getInductionBypassValue (IVPhi);
10243+ Inc->setIncomingValueForBlock (BB, V);
10244+ }
1021010245 if (!MainILV.areSafetyChecksAdded ())
1021110246 DisableRuntimeUnroll = true ;
1021210247 } else {
1021310248 InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
1021410249 VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10215- PSI, Checks);
10250+ PSI, Checks, BestPlan );
1021610251 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1021710252 ++LoopsVectorized;
1021810253
0 commit comments