@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467
467
ElementCount MinProfitableTripCount,
468
468
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469
469
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471
+ VPlan &Plan)
471
472
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472
473
AC (AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473
474
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474
- PSI(PSI), RTChecks(RTChecks) {
475
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475
476
// Query this against the original loop and save it here because the profile
476
477
// of the original loop header may change as the transformation happens.
477
478
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize (
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522
523
// / and the resume values can come from an additional bypass block, the \p
523
524
// / AdditionalBypass pair provides information about the bypass block and the
524
525
// / end value on the edge from bypass to this loop.
525
- PHINode * createInductionResumeValue (
526
+ void createInductionResumeValue (
526
527
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527
528
ArrayRef<BasicBlock *> BypassBlocks,
528
529
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535
536
// / count of the original loop for both main loop and epilogue vectorization.
536
537
void setTripCount (Value *TC) { TripCount = TC; }
537
538
539
+ std::pair<BasicBlock *, Value *>
540
+ getInductionBypassValue (PHINode *OrigPhi) const {
541
+ return InductionBypassValues.find (OrigPhi)->second ;
542
+ }
543
+
538
544
protected:
539
545
friend class LoopVectorizationPlanner ;
540
546
@@ -674,6 +680,11 @@ class InnerLoopVectorizer {
674
680
// / Structure to hold information about generated runtime checks, responsible
675
681
// / for cleaning the checks, if vectorization turns out unprofitable.
676
682
GeneratedRTChecks &RTChecks;
683
+
684
+ // / Mapping of induction phis to their bypass values and bypass blocks.
685
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
686
+
687
+ VPlan &Plan;
677
688
};
678
689
679
690
// / Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -715,10 +726,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
715
726
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
716
727
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
717
728
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
718
- GeneratedRTChecks &Checks)
729
+ GeneratedRTChecks &Checks, VPlan &Plan )
719
730
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
720
731
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
721
- CM, BFI, PSI, Checks),
732
+ CM, BFI, PSI, Checks, Plan ),
722
733
EPI (EPI) {}
723
734
724
735
// Override this function to handle the more complex control flow around the
@@ -755,9 +766,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
755
766
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
756
767
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
757
768
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
758
- GeneratedRTChecks &Check)
769
+ GeneratedRTChecks &Check, VPlan &Plan )
759
770
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
760
- EPI, LVL, CM, BFI, PSI, Check) {}
771
+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
761
772
// / Implements the interface for creating a vectorized skeleton using the
762
773
// / *main loop* strategy (ie the first pass of vplan execution).
763
774
std::pair<BasicBlock *, Value *>
@@ -789,9 +800,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
789
800
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
790
801
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
791
802
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
792
- GeneratedRTChecks &Checks)
803
+ GeneratedRTChecks &Checks, VPlan &Plan )
793
804
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
794
- EPI, LVL, CM, BFI, PSI, Checks) {
805
+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
795
806
TripCount = EPI.TripCount ;
796
807
}
797
808
// / Implements the interface for creating a vectorized skeleton using the
@@ -2586,7 +2597,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2586
2597
nullptr , Twine (Prefix) + " scalar.ph" );
2587
2598
}
2588
2599
2589
- PHINode *InnerLoopVectorizer::createInductionResumeValue (
2600
+ static void addOperandToPhiInVPIRBasicBlock (VPIRBasicBlock *VPBB, PHINode *P,
2601
+ VPValue *Op) {
2602
+ for (VPRecipeBase &R : *VPBB) {
2603
+ auto *IRI = cast<VPIRInstruction>(&R);
2604
+ if (&IRI->getInstruction () == P) {
2605
+ IRI->addOperand (Op);
2606
+ break ;
2607
+ }
2608
+ }
2609
+ }
2610
+
2611
+ void InnerLoopVectorizer::createInductionResumeValue (
2590
2612
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2591
2613
ArrayRef<BasicBlock *> BypassBlocks,
2592
2614
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2621,27 +2643,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
2621
2643
}
2622
2644
}
2623
2645
2624
- // Create phi nodes to merge from the backedge-taken check block.
2625
- PHINode *BCResumeVal =
2626
- PHINode::Create (OrigPhi->getType (), 3 , " bc.resume.val" ,
2627
- LoopScalarPreHeader->getFirstNonPHIIt ());
2628
- // Copy original phi DL over to the new one.
2629
- BCResumeVal->setDebugLoc (OrigPhi->getDebugLoc ());
2646
+ VPBasicBlock *MiddleVPBB =
2647
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
2630
2648
2631
- // The new PHI merges the original incoming value, in case of a bypass,
2632
- // or the value at the end of the vectorized loop.
2633
- BCResumeVal->addIncoming (EndValue, LoopMiddleBlock);
2649
+ VPBasicBlock *ScalarPHVPBB = nullptr ;
2650
+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
2651
+ // Order is strict: first is the exit block, second is the scalar preheader.
2652
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
2653
+ } else {
2654
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
2655
+ }
2634
2656
2635
- // Fix the scalar body counter (PHI node).
2636
- // The old induction's phi node in the scalar body needs the truncated
2637
- // value.
2638
- for (BasicBlock *BB : BypassBlocks)
2639
- BCResumeVal-> addIncoming (II. getStartValue ( ), BB );
2657
+ VPBuilder ScalarPHBuilder (ScalarPHVPBB);
2658
+ auto *ResumePhiRecipe = ScalarPHBuilder. createNaryOp (
2659
+ VPInstruction::ResumePhi,
2660
+ {Plan. getOrAddLiveIn (EndValue), Plan. getOrAddLiveIn (II. getStartValue ())},
2661
+ OrigPhi-> getDebugLoc ( ), " bc.resume.val " );
2640
2662
2641
- if (AdditionalBypass.first )
2642
- BCResumeVal->setIncomingValueForBlock (AdditionalBypass.first ,
2643
- EndValueFromAdditionalBypass);
2644
- return BCResumeVal;
2663
+ auto *ScalarLoopHeader =
2664
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ());
2665
+ addOperandToPhiInVPIRBasicBlock (ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2666
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first ,
2667
+ EndValueFromAdditionalBypass};
2645
2668
}
2646
2669
2647
2670
// / Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2674,10 +2697,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
2674
2697
for (const auto &InductionEntry : Legal->getInductionVars ()) {
2675
2698
PHINode *OrigPhi = InductionEntry.first ;
2676
2699
const InductionDescriptor &II = InductionEntry.second ;
2677
- PHINode *BCResumeVal = createInductionResumeValue (
2678
- OrigPhi, II, getExpandedStep (II, ExpandedSCEVs), LoopBypassBlocks,
2679
- AdditionalBypass);
2680
- OrigPhi->setIncomingValueForBlock (LoopScalarPreHeader, BCResumeVal);
2700
+ createInductionResumeValue (OrigPhi, II, getExpandedStep (II, ExpandedSCEVs),
2701
+ LoopBypassBlocks, AdditionalBypass);
2681
2702
}
2682
2703
}
2683
2704
@@ -7787,6 +7808,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7787
7808
// the second pass for the scalar loop. The induction resume values for the
7788
7809
// inductions in the epilogue loop are created before executing the plan for
7789
7810
// the epilogue loop.
7811
+ for (VPRecipeBase &R :
7812
+ Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
7813
+ // Create induction resume values for both widened pointer and
7814
+ // integer/fp inductions and update the start value of the induction
7815
+ // recipes to use the resume value.
7816
+ PHINode *IndPhi = nullptr ;
7817
+ const InductionDescriptor *ID;
7818
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7819
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
7820
+ ID = &Ind->getInductionDescriptor ();
7821
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7822
+ IndPhi = WidenInd->getPHINode ();
7823
+ ID = &WidenInd->getInductionDescriptor ();
7824
+ } else
7825
+ continue ;
7826
+
7827
+ createInductionResumeValue (IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
7828
+ LoopBypassBlocks);
7829
+ }
7790
7830
7791
7831
return {LoopVectorPreHeader, nullptr };
7792
7832
}
@@ -8945,14 +8985,9 @@ static void addLiveOutsForFirstOrderRecurrences(
8945
8985
VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
8946
8986
" scalar.recur.init" );
8947
8987
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8948
- for (VPRecipeBase &R :
8949
- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ())) {
8950
- auto *IRI = cast<VPIRInstruction>(&R);
8951
- if (&IRI->getInstruction () == FORPhi) {
8952
- IRI->addOperand (ResumePhiRecipe);
8953
- break ;
8954
- }
8955
- }
8988
+ addOperandToPhiInVPIRBasicBlock (
8989
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ()), FORPhi,
8990
+ ResumePhiRecipe);
8956
8991
8957
8992
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
8958
8993
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9679,7 +9714,7 @@ static bool processLoopInVPlanNativePath(
9679
9714
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
9680
9715
AddBranchWeights);
9681
9716
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9682
- VF.Width , 1 , LVL, &CM, BFI, PSI, Checks);
9717
+ VF.Width , 1 , LVL, &CM, BFI, PSI, Checks, BestPlan );
9683
9718
LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
9684
9719
<< L->getHeader ()->getParent ()->getName () << " \"\n " );
9685
9720
LVP.executePlan (VF.Width , 1 , BestPlan, LB, DT, false );
@@ -10167,11 +10202,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10167
10202
assert (IC > 1 && " interleave count should not be 1 or 0" );
10168
10203
// If we decided that it is not legal to vectorize the loop, then
10169
10204
// interleave it.
10205
+ VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10170
10206
InnerLoopVectorizer Unroller (
10171
10207
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed (1 ),
10172
- ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks);
10208
+ ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
10173
10209
10174
- VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10175
10210
LVP.executePlan (VF.Width , IC, BestPlan, Unroller, DT, false );
10176
10211
10177
10212
ORE->emit ([&]() {
@@ -10193,10 +10228,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10193
10228
// to be vectorized by executing the plan (potentially with a different
10194
10229
// factor) again shortly afterwards.
10195
10230
EpilogueLoopVectorizationInfo EPI (VF.Width , IC, EpilogueVF.Width , 1 );
10231
+ std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10196
10232
EpilogueVectorizerMainLoop MainILV (L, PSE, LI, DT, TLI, TTI, AC, ORE,
10197
- EPI, &LVL, &CM, BFI, PSI, Checks);
10233
+ EPI, &LVL, &CM, BFI, PSI, Checks,
10234
+ *BestMainPlan);
10198
10235
10199
- std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10200
10236
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10201
10237
*BestMainPlan, MainILV, DT, true );
10202
10238
++LoopsVectorized;
@@ -10205,11 +10241,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10205
10241
// edges from the first pass.
10206
10242
EPI.MainLoopVF = EPI.EpilogueVF ;
10207
10243
EPI.MainLoopUF = EPI.EpilogueUF ;
10244
+ VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10208
10245
EpilogueVectorizerEpilogueLoop EpilogILV (L, PSE, LI, DT, TLI, TTI, AC,
10209
10246
ORE, EPI, &LVL, &CM, BFI, PSI,
10210
- Checks);
10247
+ Checks, BestEpiPlan );
10211
10248
10212
- VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10213
10249
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion ();
10214
10250
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
10215
10251
Header->setName (" vec.epilog.vector.body" );
@@ -10258,23 +10294,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10258
10294
RdxDesc.getRecurrenceStartValue ());
10259
10295
}
10260
10296
} else {
10261
- // Create induction resume values for both widened pointer and
10262
- // integer/fp inductions and update the start value of the induction
10263
- // recipes to use the resume value.
10297
+ // Retrieve the induction resume values for wide inductions from
10298
+ // their original phi nodes in the scalar loop.
10264
10299
PHINode *IndPhi = nullptr ;
10265
- const InductionDescriptor *ID;
10266
10300
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10267
10301
IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
10268
- ID = &Ind->getInductionDescriptor ();
10269
10302
} else {
10270
10303
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10271
10304
IndPhi = WidenInd->getPHINode ();
10272
- ID = &WidenInd->getInductionDescriptor ();
10273
10305
}
10274
-
10275
- ResumeV = MainILV.createInductionResumeValue (
10276
- IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
10277
- {EPI.MainLoopIterationCountCheck });
10306
+ ResumeV = IndPhi->getIncomingValueForBlock (L->getLoopPreheader ());
10278
10307
}
10279
10308
assert (ResumeV && " Must have a resume value" );
10280
10309
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn (ResumeV);
@@ -10286,13 +10315,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10286
10315
LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
10287
10316
DT, true , &ExpandedSCEVs);
10288
10317
++LoopsEpilogueVectorized;
10318
+ BasicBlock *PH = L->getLoopPreheader ();
10289
10319
10320
+ for (const auto &[IVPhi, _] : LVL.getInductionVars ()) {
10321
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock (PH));
10322
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue (IVPhi);
10323
+ Inc->setIncomingValueForBlock (BB, V);
10324
+ }
10290
10325
if (!MainILV.areSafetyChecksAdded ())
10291
10326
DisableRuntimeUnroll = true ;
10292
10327
} else {
10293
10328
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
10294
10329
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10295
- PSI, Checks);
10330
+ PSI, Checks, BestPlan );
10296
10331
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10297
10332
++LoopsVectorized;
10298
10333
0 commit comments