@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467467 ElementCount MinProfitableTripCount,
468468 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469469 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471+ VPlan &Plan)
471472 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472473 AC (AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473474 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474- PSI(PSI), RTChecks(RTChecks) {
475+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475476 // Query this against the original loop and save it here because the profile
476477 // of the original loop header may change as the transformation happens.
477478 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize (
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522523 // / and the resume values can come from an additional bypass block, the \p
523524 // / AdditionalBypass pair provides information about the bypass block and the
524525 // / end value on the edge from bypass to this loop.
525- PHINode * createInductionResumeValue (
526+ void createInductionResumeValue (
526527 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527528 ArrayRef<BasicBlock *> BypassBlocks,
528529 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535536 // / count of the original loop for both main loop and epilogue vectorization.
536537 void setTripCount (Value *TC) { TripCount = TC; }
537538
539+ std::pair<BasicBlock *, Value *>
540+ getInductionBypassValue (PHINode *OrigPhi) const {
541+ return InductionBypassValues.find (OrigPhi)->second ;
542+ }
543+
538544protected:
539545 friend class LoopVectorizationPlanner ;
540546
@@ -677,6 +683,11 @@ class InnerLoopVectorizer {
677683 // / Structure to hold information about generated runtime checks, responsible
678684 // / for cleaning the checks, if vectorization turns out unprofitable.
679685 GeneratedRTChecks &RTChecks;
686+
687+ // / Mapping of induction phis to their bypass values and bypass blocks.
688+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
689+
690+ VPlan &Plan;
680691};
681692
682693// / Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -718,10 +729,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
718729 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
719730 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
720731 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
721- GeneratedRTChecks &Checks)
732+ GeneratedRTChecks &Checks, VPlan &Plan )
722733 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
723734 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
724- CM, BFI, PSI, Checks),
735+ CM, BFI, PSI, Checks, Plan ),
725736 EPI (EPI) {}
726737
727738 // Override this function to handle the more complex control flow around the
@@ -758,9 +769,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
758769 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
759770 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
760771 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
761- GeneratedRTChecks &Check)
772+ GeneratedRTChecks &Check, VPlan &Plan )
762773 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
763- EPI, LVL, CM, BFI, PSI, Check) {}
774+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
764775 // / Implements the interface for creating a vectorized skeleton using the
765776 // / *main loop* strategy (ie the first pass of vplan execution).
766777 std::pair<BasicBlock *, Value *>
@@ -787,9 +798,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
787798 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
788799 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
789800 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
790- GeneratedRTChecks &Checks)
801+ GeneratedRTChecks &Checks, VPlan &Plan )
791802 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
792- EPI, LVL, CM, BFI, PSI, Checks) {
803+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
793804 TripCount = EPI.TripCount ;
794805 }
795806 // / Implements the interface for creating a vectorized skeleton using the
@@ -2546,7 +2557,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
25462557 nullptr , Twine (Prefix) + " scalar.ph" );
25472558}
25482559
2549- PHINode *InnerLoopVectorizer::createInductionResumeValue (
2560+ static void addOperandToPhiInVPIRBasicBlock (VPIRBasicBlock *VPBB, PHINode *P,
2561+ VPValue *Op) {
2562+ for (VPRecipeBase &R : *VPBB) {
2563+ auto *IRI = cast<VPIRInstruction>(&R);
2564+ if (&IRI->getInstruction () == P) {
2565+ IRI->addOperand (Op);
2566+ break ;
2567+ }
2568+ }
2569+ }
2570+
2571+ void InnerLoopVectorizer::createInductionResumeValue (
25502572 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
25512573 ArrayRef<BasicBlock *> BypassBlocks,
25522574 std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2581,27 +2603,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
25812603 }
25822604 }
25832605
2584- // Create phi nodes to merge from the backedge-taken check block.
2585- PHINode *BCResumeVal =
2586- PHINode::Create (OrigPhi->getType (), 3 , " bc.resume.val" ,
2587- LoopScalarPreHeader->getFirstNonPHIIt ());
2588- // Copy original phi DL over to the new one.
2589- BCResumeVal->setDebugLoc (OrigPhi->getDebugLoc ());
2606+ VPBasicBlock *MiddleVPBB =
2607+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
25902608
2591- // The new PHI merges the original incoming value, in case of a bypass,
2592- // or the value at the end of the vectorized loop.
2593- BCResumeVal->addIncoming (EndValue, LoopMiddleBlock);
2609+ VPBasicBlock *ScalarPHVPBB = nullptr ;
2610+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
2611+ // Order is strict: first is the exit block, second is the scalar preheader.
2612+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
2613+ } else {
2614+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
2615+ }
25942616
2595- // Fix the scalar body counter (PHI node).
2596- // The old induction's phi node in the scalar body needs the truncated
2597- // value.
2598- for (BasicBlock *BB : BypassBlocks)
2599- BCResumeVal-> addIncoming (II. getStartValue ( ), BB );
2617+ VPBuilder ScalarPHBuilder (ScalarPHVPBB);
2618+ auto *ResumePhiRecipe = ScalarPHBuilder. createNaryOp (
2619+ VPInstruction::ResumePhi,
2620+ {Plan. getOrAddLiveIn (EndValue), Plan. getOrAddLiveIn (II. getStartValue ())},
2621+ OrigPhi-> getDebugLoc ( ), " bc.resume.val " );
26002622
2601- if (AdditionalBypass.first )
2602- BCResumeVal->setIncomingValueForBlock (AdditionalBypass.first ,
2603- EndValueFromAdditionalBypass);
2604- return BCResumeVal;
2623+ auto *ScalarLoopHeader =
2624+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ());
2625+ addOperandToPhiInVPIRBasicBlock (ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2626+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first ,
2627+ EndValueFromAdditionalBypass};
26052628}
26062629
26072630// / Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2634,10 +2657,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
26342657 for (const auto &InductionEntry : Legal->getInductionVars ()) {
26352658 PHINode *OrigPhi = InductionEntry.first ;
26362659 const InductionDescriptor &II = InductionEntry.second ;
2637- PHINode *BCResumeVal = createInductionResumeValue (
2638- OrigPhi, II, getExpandedStep (II, ExpandedSCEVs), LoopBypassBlocks,
2639- AdditionalBypass);
2640- OrigPhi->setIncomingValueForBlock (LoopScalarPreHeader, BCResumeVal);
2660+ createInductionResumeValue (OrigPhi, II, getExpandedStep (II, ExpandedSCEVs),
2661+ LoopBypassBlocks, AdditionalBypass);
26412662 }
26422663}
26432664
@@ -7738,6 +7759,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
77387759 // the second pass for the scalar loop. The induction resume values for the
77397760 // inductions in the epilogue loop are created before executing the plan for
77407761 // the epilogue loop.
7762+ for (VPRecipeBase &R :
7763+ Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
7764+ // Create induction resume values for both widened pointer and
7765+ // integer/fp inductions and update the start value of the induction
7766+ // recipes to use the resume value.
7767+ PHINode *IndPhi = nullptr ;
7768+ const InductionDescriptor *ID;
7769+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7770+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
7771+ ID = &Ind->getInductionDescriptor ();
7772+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7773+ IndPhi = WidenInd->getPHINode ();
7774+ ID = &WidenInd->getInductionDescriptor ();
7775+ } else
7776+ continue ;
7777+
7778+ createInductionResumeValue (IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
7779+ LoopBypassBlocks);
7780+ }
77417781
77427782 return {LoopVectorPreHeader, nullptr };
77437783}
@@ -8911,14 +8951,9 @@ static void addLiveOutsForFirstOrderRecurrences(
89118951 VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
89128952 " scalar.recur.init" );
89138953 auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8914- for (VPRecipeBase &R :
8915- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ())) {
8916- auto *IRI = cast<VPIRInstruction>(&R);
8917- if (&IRI->getInstruction () == FORPhi) {
8918- IRI->addOperand (ResumePhiRecipe);
8919- break ;
8920- }
8921- }
8954+ addOperandToPhiInVPIRBasicBlock (
8955+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ()), FORPhi,
8956+ ResumePhiRecipe);
89228957
89238958 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
89248959 // Extract the penultimate value of the recurrence and use it as operand for
@@ -9645,7 +9680,7 @@ static bool processLoopInVPlanNativePath(
96459680 GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
96469681 AddBranchWeights);
96479682 InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9648- VF.Width , 1 , LVL, &CM, BFI, PSI, Checks);
9683+ VF.Width , 1 , LVL, &CM, BFI, PSI, Checks, BestPlan );
96499684 LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
96509685 << L->getHeader ()->getParent ()->getName () << " \"\n " );
96519686 LVP.executePlan (VF.Width , 1 , BestPlan, LB, DT, false );
@@ -10133,11 +10168,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1013310168 assert (IC > 1 && " interleave count should not be 1 or 0" );
1013410169 // If we decided that it is not legal to vectorize the loop, then
1013510170 // interleave it.
10171+ VPlan &BestPlan = LVP.getPlanFor (VF.Width );
1013610172 InnerLoopVectorizer Unroller (
1013710173 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed (1 ),
10138- ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks);
10174+ ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
1013910175
10140- VPlan &BestPlan = LVP.getPlanFor (VF.Width );
1014110176 LVP.executePlan (VF.Width , IC, BestPlan, Unroller, DT, false );
1014210177
1014310178 ORE->emit ([&]() {
@@ -10159,10 +10194,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1015910194 // to be vectorized by executing the plan (potentially with a different
1016010195 // factor) again shortly afterwards.
1016110196 EpilogueLoopVectorizationInfo EPI (VF.Width , IC, EpilogueVF.Width , 1 );
10197+ std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
1016210198 EpilogueVectorizerMainLoop MainILV (L, PSE, LI, DT, TLI, TTI, AC, ORE,
10163- EPI, &LVL, &CM, BFI, PSI, Checks);
10199+ EPI, &LVL, &CM, BFI, PSI, Checks,
10200+ *BestMainPlan);
1016410201
10165- std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
1016610202 auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
1016710203 *BestMainPlan, MainILV, DT, true );
1016810204 ++LoopsVectorized;
@@ -10171,11 +10207,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1017110207 // edges from the first pass.
1017210208 EPI.MainLoopVF = EPI.EpilogueVF ;
1017310209 EPI.MainLoopUF = EPI.EpilogueUF ;
10210+ VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
1017410211 EpilogueVectorizerEpilogueLoop EpilogILV (L, PSE, LI, DT, TLI, TTI, AC,
1017510212 ORE, EPI, &LVL, &CM, BFI, PSI,
10176- Checks);
10213+ Checks, BestEpiPlan );
1017710214
10178- VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
1017910215 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion ();
1018010216 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
1018110217 Header->setName (" vec.epilog.vector.body" );
@@ -10224,23 +10260,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1022410260 RdxDesc.getRecurrenceStartValue ());
1022510261 }
1022610262 } else {
10227- // Create induction resume values for both widened pointer and
10228- // integer/fp inductions and update the start value of the induction
10229- // recipes to use the resume value.
10263+ // Retrieve the induction resume values for wide inductions from
10264+ // their original phi nodes in the scalar loop.
1023010265 PHINode *IndPhi = nullptr ;
10231- const InductionDescriptor *ID;
1023210266 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
1023310267 IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
10234- ID = &Ind->getInductionDescriptor ();
1023510268 } else {
1023610269 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
1023710270 IndPhi = WidenInd->getPHINode ();
10238- ID = &WidenInd->getInductionDescriptor ();
1023910271 }
10240-
10241- ResumeV = MainILV.createInductionResumeValue (
10242- IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
10243- {EPI.MainLoopIterationCountCheck });
10272+ ResumeV = IndPhi->getIncomingValueForBlock (L->getLoopPreheader ());
1024410273 }
1024510274 assert (ResumeV && " Must have a resume value" );
1024610275 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn (ResumeV);
@@ -10252,13 +10281,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1025210281 LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
1025310282 DT, true , &ExpandedSCEVs);
1025410283 ++LoopsEpilogueVectorized;
10284+ BasicBlock *PH = L->getLoopPreheader ();
1025510285
10286+ for (const auto &[IVPhi, _] : LVL.getInductionVars ()) {
10287+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock (PH));
10288+ const auto &[BB, V] = EpilogILV.getInductionBypassValue (IVPhi);
10289+ Inc->setIncomingValueForBlock (BB, V);
10290+ }
1025610291 if (!MainILV.areSafetyChecksAdded ())
1025710292 DisableRuntimeUnroll = true ;
1025810293 } else {
1025910294 InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
1026010295 VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10261- PSI, Checks);
10296+ PSI, Checks, BestPlan );
1026210297 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1026310298 ++LoopsVectorized;
1026410299
0 commit comments