@@ -173,6 +173,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173173STATISTIC (LoopsVectorized, " Number of loops vectorized" );
174174STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
175175STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
176+ STATISTIC (LoopsAliasMasked, " Number of loops predicated with an alias mask" );
176177
177178static cl::opt<bool > EnableEpilogueVectorization (
178179 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -1806,6 +1807,10 @@ class GeneratedRTChecks {
18061807 PredicatedScalarEvolution &PSE;
18071808
18081809public:
1810+ // / Set by VPlan when the vector loop should be entered even when runtime
1811+ // / checks determine that pointers alias within an iteration.
1812+ bool HasAliasMask = false ;
1813+
18091814 GeneratedRTChecks (PredicatedScalarEvolution &PSE, DominatorTree *DT,
18101815 LoopInfo *LI, TargetTransformInfo *TTI,
18111816 const DataLayout &DL, bool AddBranchWeights)
@@ -1847,9 +1852,11 @@ class GeneratedRTChecks {
18471852
18481853 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking ();
18491854 if (RtPtrChecking.Need ) {
1850- auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1851- MemCheckBlock = SplitBlock (Pred, Pred->getTerminator (), DT, LI, nullptr ,
1852- " vector.memcheck" );
1855+ if (!MemCheckBlock) {
1856+ auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1857+ MemCheckBlock = SplitBlock (Pred, Pred->getTerminator (), DT, LI, nullptr ,
1858+ " vector.memcheck" );
1859+ }
18531860
18541861 auto DiffChecks = RtPtrChecking.getDiffChecks ();
18551862 if (DiffChecks) {
@@ -2077,11 +2084,18 @@ class GeneratedRTChecks {
20772084 if (OuterLoop)
20782085 OuterLoop->addBasicBlockToLoop (MemCheckBlock, *LI);
20792086
2080- BranchInst &BI =
2081- *BranchInst::Create (Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2082- if (AddBranchWeights) {
2087+ // TODO: Branch to the vector preheader conditionally based on the number of
2088+ // non-aliasing elements. The scalar loop will likely be better if only one
2089+ // or two elements will be processed per vectorized loop iteration.
2090+
2091+ // Jump to the vector preheader unconditionally if it's safe to do so
2092+ // because an alias mask has been set up.
2093+ BranchInst &BI = HasAliasMask
2094+ ? *BranchInst::Create (LoopVectorPreHeader)
2095+ : *BranchInst::Create (Bypass, LoopVectorPreHeader,
2096+ MemRuntimeCheckCond);
2097+ if (!HasAliasMask && AddBranchWeights)
20832098 setBranchWeights (BI, MemCheckBypassWeights, /* IsExpected=*/ false );
2084- }
20852099 ReplaceInstWithInst (MemCheckBlock->getTerminator (), &BI);
20862100 MemCheckBlock->getTerminator ()->setDebugLoc (
20872101 Pred->getTerminator ()->getDebugLoc ());
@@ -2564,7 +2578,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
25642578 });
25652579 }
25662580
2567- LoopBypassBlocks.push_back (MemCheckBlock);
2581+ // If an alias mask has been set up then we don't need the bypass, as the
2582+ // vector preheader will be branched to unconditionally.
2583+ if (!RTChecks.HasAliasMask )
2584+ LoopBypassBlocks.push_back (MemCheckBlock);
25682585
25692586 AddedSafetyChecks = true ;
25702587
@@ -7097,7 +7114,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70977114 return VectorizationFactor::Disabled ();
70987115}
70997116
7100- void LoopVectorizationPlanner::plan (ElementCount UserVF, unsigned UserIC) {
7117+ void LoopVectorizationPlanner::plan (
7118+ ElementCount UserVF, unsigned UserIC,
7119+ std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
71017120 assert (OrigLoop->isInnermost () && " Inner loop expected." );
71027121 CM.collectValuesToIgnore ();
71037122 CM.collectElementTypesForWidening ();
@@ -7106,6 +7125,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71067125 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
71077126 return ;
71087127
7128+ ArrayRef<PointerDiffInfo> DiffChecks;
7129+ if (RTChecks.has_value () && useActiveLaneMask (CM.getTailFoldingStyle (true )))
7130+ DiffChecks = *RTChecks;
7131+
71097132 // Invalidate interleave groups if all blocks of loop will be predicated.
71107133 if (CM.blockNeedsPredicationForAnyReason (OrigLoop->getHeader ()) &&
71117134 !useMaskedInterleavedAccesses (TTI)) {
@@ -7138,7 +7161,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71387161 CM.collectInLoopReductions ();
71397162 if (CM.selectUserVectorizationFactor (UserVF)) {
71407163 LLVM_DEBUG (dbgs () << " LV: Using user VF " << UserVF << " .\n " );
7141- buildVPlansWithVPRecipes (UserVF, UserVF);
7164+ buildVPlansWithVPRecipes (UserVF, UserVF, DiffChecks, HasAliasMask );
71427165 LLVM_DEBUG (printPlans (dbgs ()));
71437166 return ;
71447167 }
@@ -7167,8 +7190,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71677190 CM.collectInstsToScalarize (VF);
71687191 }
71697192
7170- buildVPlansWithVPRecipes (ElementCount::getFixed (1 ), MaxFactors.FixedVF );
7171- buildVPlansWithVPRecipes (ElementCount::getScalable (1 ), MaxFactors.ScalableVF );
7193+ buildVPlansWithVPRecipes (ElementCount::getFixed (1 ), MaxFactors.FixedVF ,
7194+ DiffChecks, HasAliasMask);
7195+ buildVPlansWithVPRecipes (ElementCount::getScalable (1 ), MaxFactors.ScalableVF ,
7196+ DiffChecks, HasAliasMask);
71727197
71737198 LLVM_DEBUG (printPlans (dbgs ()));
71747199}
@@ -7690,7 +7715,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
76907715 CanonicalIVStartValue, State);
76917716
76927717 BestVPlan.execute (&State);
7693-
76947718 // 2.5 Collect reduction resume values.
76957719 auto *ExitVPBB =
76967720 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion ()->getSingleSuccessor ());
@@ -7923,7 +7947,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
79237947 // reduction phis in the scalar loop preheader.
79247948 if (EPI.SCEVSafetyCheck )
79257949 LoopBypassBlocks.push_back (EPI.SCEVSafetyCheck );
7926- if (EPI.MemSafetyCheck )
7950+ if (EPI.MemSafetyCheck && !RTChecks. HasAliasMask )
79277951 LoopBypassBlocks.push_back (EPI.MemSafetyCheck );
79287952 LoopBypassBlocks.push_back (EPI.EpilogueIterationCountCheck );
79297953
@@ -8179,9 +8203,8 @@ void VPRecipeBuilder::createHeaderMask() {
81798203
81808204 VPBuilder::InsertPointGuard Guard (Builder);
81818205 Builder.setInsertPoint (HeaderVPBB, NewInsertionPoint);
8182- VPValue *BlockMask = nullptr ;
81838206 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount ();
8184- BlockMask = Builder.createICmp (CmpInst::ICMP_ULE, IV, BTC);
8207+ VPValue * BlockMask = Builder.createICmp (CmpInst::ICMP_ULE, IV, BTC);
81858208 BlockMaskCache[Header] = BlockMask;
81868209}
81878210
@@ -8720,14 +8743,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87208743 return tryToWiden (Instr, Operands, VPBB);
87218744}
87228745
8723- void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8724- ElementCount MaxVF) {
8746+ void LoopVectorizationPlanner::buildVPlansWithVPRecipes (
8747+ ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
8748+ bool &HasAliasMask) {
87258749 assert (OrigLoop->isInnermost () && " Inner loop expected." );
87268750
87278751 auto MaxVFTimes2 = MaxVF * 2 ;
87288752 for (ElementCount VF = MinVF; ElementCount::isKnownLT (VF, MaxVFTimes2);) {
87298753 VFRange SubRange = {VF, MaxVFTimes2};
8730- if (auto Plan = tryToBuildVPlanWithVPRecipes (SubRange)) {
8754+ if (auto Plan =
8755+ tryToBuildVPlanWithVPRecipes (SubRange, RTChecks, HasAliasMask)) {
87318756 // Now optimize the initial VPlan.
87328757 if (!Plan->hasVF (ElementCount::getFixed (1 )))
87338758 VPlanTransforms::truncateToMinimalBitwidths (*Plan,
@@ -8760,6 +8785,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
87608785
87618786 VPBuilder Builder (TopRegion->getExitingBasicBlock ());
87628787 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8788+ // If an alias mask is present, this will be replaced by an increment of the
8789+ // mask's popcount.
87638790 auto *CanonicalIVIncrement = Builder.createOverflowingOp (
87648791 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF ()}, {HasNUW, false }, DL,
87658792 " index.next" );
@@ -8978,8 +9005,8 @@ static void addLiveOutsForFirstOrderRecurrences(
89789005 }
89799006}
89809007
8981- VPlanPtr
8982- LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes ( VFRange &Range) {
9008+ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes (
9009+ VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask ) {
89839010
89849011 SmallPtrSet<const InterleaveGroup<Instruction> *, 1 > InterleaveGroups;
89859012
@@ -9215,7 +9242,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92159242 bool WithoutRuntimeCheck =
92169243 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
92179244 VPlanTransforms::addActiveLaneMask (*Plan, ForControlFlow,
9218- WithoutRuntimeCheck);
9245+ WithoutRuntimeCheck, PSE, RTChecks);
9246+ if (ForControlFlow && !RTChecks.empty ())
9247+ HasAliasMask = true ;
92199248 }
92209249 return Plan;
92219250}
@@ -9699,6 +9728,7 @@ static bool processLoopInVPlanNativePath(
96999728 // Mark the loop as already vectorized to avoid vectorizing again.
97009729 Hints.setAlreadyVectorized ();
97019730 assert (!verifyFunction (*L->getHeader ()->getParent (), &dbgs ()));
9731+
97029732 return true ;
97039733}
97049734
@@ -10030,18 +10060,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1003010060 ElementCount UserVF = Hints.getWidth ();
1003110061 unsigned UserIC = Hints.getInterleave ();
1003210062
10063+ bool AddBranchWeights =
10064+ hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
10065+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
10066+ AddBranchWeights);
10067+
1003310068 // Plan how to best vectorize.
10034- LVP.plan (UserVF, UserIC);
10069+ LVP.plan (UserVF, UserIC,
10070+ LVL.getLAI ()->getRuntimePointerChecking ()->getDiffChecks (),
10071+ Checks.HasAliasMask );
1003510072 VectorizationFactor VF = LVP.computeBestVF ();
10073+ if (Checks.HasAliasMask )
10074+ LoopsAliasMasked++;
1003610075 unsigned IC = 1 ;
1003710076
1003810077 if (ORE->allowExtraAnalysis (LV_NAME))
1003910078 LVP.emitInvalidCostRemarks (ORE);
1004010079
10041- bool AddBranchWeights =
10042- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
10043- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
10044- AddBranchWeights);
1004510080 if (LVP.hasPlanWithVF (VF.Width )) {
1004610081 // Select the interleave count.
1004710082 IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
0 commit comments