@@ -399,12 +399,6 @@ static cl::opt<bool> EnableEarlyExitVectorization(
399399 cl::desc(
400400 " Enable vectorization of early exit loops with uncountable exits." ));
401401
402- // Likelyhood of bypassing the vectorized loop because assumptions about SCEV
403- // variables not overflowing do not hold. See `emitSCEVChecks`.
404- static constexpr uint32_t SCEVCheckBypassWeights[] = {1 , 127 };
405- // Likelyhood of bypassing the vectorized loop because pointers overlap. See
406- // `emitMemRuntimeChecks`.
407- static constexpr uint32_t MemCheckBypassWeights[] = {1 , 127 };
408402// Likelyhood of bypassing the vectorized loop because there are zero trips left
409403// after prolog. See `emitIterationCountCheck`.
410404static constexpr uint32_t MinItersBypassWeights[] = {1 , 127 };
@@ -544,16 +538,6 @@ class InnerLoopVectorizer {
544538 // / it overflows.
545539 void emitIterationCountCheck (BasicBlock *Bypass);
546540
547- // / Emit a bypass check to see if all of the SCEV assumptions we've
548- // / had to make are correct. Returns the block containing the checks or
549- // / nullptr if no checks have been added.
550- BasicBlock *emitSCEVChecks (BasicBlock *Bypass);
551-
552- // / Emit bypass checks to check any memory assumptions we may have made.
553- // / Returns the block containing the checks or nullptr if no checks have been
554- // / added.
555- BasicBlock *emitMemRuntimeChecks (BasicBlock *Bypass);
556-
557541 // / Emit basic blocks (prefixed with \p Prefix) for the iteration check,
558542 // / vector loop preheader, middle block and scalar preheader.
559543 void createVectorLoopSkeleton (StringRef Prefix);
@@ -657,8 +641,6 @@ struct EpilogueLoopVectorizationInfo {
657641 unsigned EpilogueUF = 0 ;
658642 BasicBlock *MainLoopIterationCountCheck = nullptr ;
659643 BasicBlock *EpilogueIterationCountCheck = nullptr ;
660- BasicBlock *SCEVSafetyCheck = nullptr ;
661- BasicBlock *MemSafetyCheck = nullptr ;
662644 Value *TripCount = nullptr ;
663645 Value *VectorTripCount = nullptr ;
664646 VPlan &EpiloguePlan;
@@ -1786,7 +1768,6 @@ class GeneratedRTChecks {
17861768 SCEVExpander MemCheckExp;
17871769
17881770 bool CostTooHigh = false ;
1789- const bool AddBranchWeights;
17901771
17911772 Loop *OuterLoop = nullptr ;
17921773
@@ -1798,11 +1779,10 @@ class GeneratedRTChecks {
17981779public:
17991780 GeneratedRTChecks (PredicatedScalarEvolution &PSE, DominatorTree *DT,
18001781 LoopInfo *LI, TargetTransformInfo *TTI,
1801- const DataLayout &DL, bool AddBranchWeights,
1802- TTI::TargetCostKind CostKind)
1782+ const DataLayout &DL, TTI::TargetCostKind CostKind)
18031783 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, " scev.check" ),
1804- MemCheckExp (*PSE.getSE(), DL, "scev.check"),
1805- AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1784+ MemCheckExp (*PSE.getSE(), DL, "scev.check"), PSE(PSE),
1785+ CostKind(CostKind) {}
18061786
18071787 // / Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18081788 // / accurately estimate the cost of the runtime checks. The blocks are
@@ -2019,56 +1999,20 @@ class GeneratedRTChecks {
20191999 MemCheckBlock->eraseFromParent ();
20202000 }
20212001
2022- // / Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2023- // / adjusts the branches to branch to the vector preheader or \p Bypass,
2024- // / depending on the generated condition.
2025- BasicBlock *emitSCEVChecks (BasicBlock *Bypass,
2026- BasicBlock *LoopVectorPreHeader) {
2002+ // / Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2003+ // / outside VPlan.
2004+ std::pair<Value *, BasicBlock *> getSCEVChecks () {
20272005 using namespace llvm ::PatternMatch;
20282006 if (!SCEVCheckCond || match (SCEVCheckCond, m_ZeroInt ()))
2029- return nullptr ;
2030-
2031- auto *Pred = LoopVectorPreHeader->getSinglePredecessor ();
2032- BranchInst::Create (LoopVectorPreHeader, SCEVCheckBlock);
2033-
2034- SCEVCheckBlock->getTerminator ()->eraseFromParent ();
2035- SCEVCheckBlock->moveBefore (LoopVectorPreHeader);
2036- Pred->getTerminator ()->replaceSuccessorWith (LoopVectorPreHeader,
2037- SCEVCheckBlock);
2038-
2039- BranchInst &BI =
2040- *BranchInst::Create (Bypass, LoopVectorPreHeader, SCEVCheckCond);
2041- if (AddBranchWeights)
2042- setBranchWeights (BI, SCEVCheckBypassWeights, /* IsExpected=*/ false );
2043- ReplaceInstWithInst (SCEVCheckBlock->getTerminator (), &BI);
2044- return SCEVCheckBlock;
2045- }
2046-
2047- // / Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2048- // / the branches to branch to the vector preheader or \p Bypass, depending on
2049- // / the generated condition.
2050- BasicBlock *emitMemRuntimeChecks (BasicBlock *Bypass,
2051- BasicBlock *LoopVectorPreHeader) {
2052- // Check if we generated code that checks in runtime if arrays overlap.
2053- if (!MemRuntimeCheckCond)
2054- return nullptr ;
2055-
2056- auto *Pred = LoopVectorPreHeader->getSinglePredecessor ();
2057- Pred->getTerminator ()->replaceSuccessorWith (LoopVectorPreHeader,
2058- MemCheckBlock);
2007+ return {nullptr , nullptr };
20592008
2060- MemCheckBlock->moveBefore (LoopVectorPreHeader);
2061-
2062- BranchInst &BI =
2063- *BranchInst::Create (Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2064- if (AddBranchWeights) {
2065- setBranchWeights (BI, MemCheckBypassWeights, /* IsExpected=*/ false );
2066- }
2067- ReplaceInstWithInst (MemCheckBlock->getTerminator (), &BI);
2068- MemCheckBlock->getTerminator ()->setDebugLoc (
2069- Pred->getTerminator ()->getDebugLoc ());
2009+ return {SCEVCheckCond, SCEVCheckBlock};
2010+ }
20702011
2071- return MemCheckBlock;
2012+ // / Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2013+ // / outside VPlan.
2014+ std::pair<Value *, BasicBlock *> getMemRuntimeChecks () {
2015+ return {MemRuntimeCheckCond, MemCheckBlock};
20722016 }
20732017
20742018 // / Return true if any runtime checks have been added
@@ -2461,53 +2405,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
24612405 " Plan's entry must be TCCCheckBlock" );
24622406}
24632407
2464- BasicBlock *InnerLoopVectorizer::emitSCEVChecks (BasicBlock *Bypass) {
2465- BasicBlock *const SCEVCheckBlock =
2466- RTChecks.emitSCEVChecks (Bypass, LoopVectorPreHeader);
2467- if (!SCEVCheckBlock)
2468- return nullptr ;
2469-
2470- assert ((!Cost->OptForSize ||
2471- Cost->Hints ->getForce () == LoopVectorizeHints::FK_Enabled) &&
2472- " Cannot SCEV check stride or overflow when optimizing for size" );
2473-
2474- introduceCheckBlockInVPlan (SCEVCheckBlock);
2475- return SCEVCheckBlock;
2476- }
2477-
2478- BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks (BasicBlock *Bypass) {
2479- BasicBlock *const MemCheckBlock =
2480- RTChecks.emitMemRuntimeChecks (Bypass, LoopVectorPreHeader);
2481-
2482- // Check if we generated code that checks in runtime if arrays overlap. We put
2483- // the checks into a separate block to make the more common case of few
2484- // elements faster.
2485- if (!MemCheckBlock)
2486- return nullptr ;
2487-
2488- // VPlan-native path does not do any analysis for runtime checks currently.
2489- assert ((!EnableVPlanNativePath || OrigLoop->begin () == OrigLoop->end ()) &&
2490- " Runtime checks are not supported for outer loops yet" );
2491-
2492- if (Cost->OptForSize ) {
2493- assert (Cost->Hints ->getForce () == LoopVectorizeHints::FK_Enabled &&
2494- " Cannot emit memory checks when optimizing for size, unless forced "
2495- " to vectorize." );
2496- ORE->emit ([&]() {
2497- return OptimizationRemarkAnalysis (DEBUG_TYPE, " VectorizationCodeSize" ,
2498- OrigLoop->getStartLoc (),
2499- OrigLoop->getHeader ())
2500- << " Code-size may be reduced by not forcing "
2501- " vectorization, or by source-code modifications "
2502- " eliminating the need for runtime checks "
2503- " (e.g., adding 'restrict')." ;
2504- });
2505- }
2506-
2507- introduceCheckBlockInVPlan (MemCheckBlock);
2508- return MemCheckBlock;
2509- }
2510-
25112408// / Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
25122409// / VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
25132410// / have a single predecessor, which is rewired to the new VPIRBasicBlock. All
@@ -2624,15 +2521,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
26242521 // to the scalar loop.
26252522 emitIterationCountCheck (LoopScalarPreHeader);
26262523
2627- // Generate the code to check any assumptions that we've made for SCEV
2628- // expressions.
2629- emitSCEVChecks (LoopScalarPreHeader);
2630-
2631- // Generate the code that checks in runtime if arrays overlap. We put the
2632- // checks into a separate block to make the more common case of few elements
2633- // faster.
2634- emitMemRuntimeChecks (LoopScalarPreHeader);
2635-
26362524 replaceVPBBWithIRVPBB (Plan.getScalarPreheader (), LoopScalarPreHeader);
26372525 return LoopVectorPreHeader;
26382526}
@@ -7323,11 +7211,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73237211 OrigLoop->getHeader ()->getContext ());
73247212 VPlanTransforms::runPass (VPlanTransforms::replicateByVF, BestVPlan, BestVF);
73257213 VPlanTransforms::runPass (VPlanTransforms::materializeBroadcasts, BestVPlan);
7326- if (hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7214+ bool HasBranchWeights =
7215+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ());
7216+ if (HasBranchWeights) {
73277217 std::optional<unsigned > VScale = CM.getVScaleForTuning ();
73287218 VPlanTransforms::runPass (VPlanTransforms::addBranchWeightToMiddleTerminator,
73297219 BestVPlan, BestVF, VScale);
73307220 }
7221+
7222+ if (!VectorizingEpilogue) {
7223+ // Checks are the same for all VPlans, added to BestVPlan only for
7224+ // compactness.
7225+ attachRuntimeChecks (BestVPlan, ILV.RTChecks , HasBranchWeights);
7226+ }
7227+
7228+ // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7229+ VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader ());
73317230 VPlanTransforms::optimizeForVFAndUF (BestVPlan, BestVF, BestUF, PSE);
73327231 VPlanTransforms::simplifyRecipes (BestVPlan, *Legal->getWidestInductionType ());
73337232 VPlanTransforms::narrowInterleaveGroups (
@@ -7375,7 +7274,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73757274
73767275 // 1. Set up the skeleton for vectorization, including vector pre-header and
73777276 // middle block. The vector loop is created during VPlan execution.
7378- VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors ()[1 ]);
7277+ BasicBlock *EntryBB =
7278+ cast<VPIRBasicBlock>(BestVPlan.getEntry ())->getIRBasicBlock ();
73797279 State.CFG .PrevBB = ILV.createVectorizedLoopSkeleton ();
73807280 if (VectorizingEpilogue)
73817281 VPlanTransforms::removeDeadRecipes (BestVPlan);
@@ -7399,6 +7299,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73997299 ILV.getOrCreateVectorTripCount (ILV.LoopVectorPreHeader ), State);
74007300 replaceVPBBWithIRVPBB (VectorPH, State.CFG .PrevBB );
74017301
7302+ // Move check blocks to their final position.
7303+ // TODO: Move as part of VPIRBB execute and update impacted tests.
7304+ if (BasicBlock *MemCheckBlock = ILV.RTChecks .getMemRuntimeChecks ().second )
7305+ MemCheckBlock->moveAfter (EntryBB);
7306+ if (BasicBlock *SCEVCheckBlock = ILV.RTChecks .getSCEVChecks ().second )
7307+ SCEVCheckBlock->moveAfter (EntryBB);
7308+
74027309 BestVPlan.execute (&State);
74037310
74047311 // 2.5 When vectorizing the epilogue, fix reduction resume values from the
@@ -7499,15 +7406,6 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
74997406 emitIterationCountCheck (LoopScalarPreHeader, true );
75007407 EPI.EpilogueIterationCountCheck ->setName (" iter.check" );
75017408
7502- // Generate the code to check any assumptions that we've made for SCEV
7503- // expressions.
7504- EPI.SCEVSafetyCheck = emitSCEVChecks (LoopScalarPreHeader);
7505-
7506- // Generate the code that checks at runtime if arrays overlap. We put the
7507- // checks into a separate block to make the more common case of few elements
7508- // faster.
7509- EPI.MemSafetyCheck = emitMemRuntimeChecks (LoopScalarPreHeader);
7510-
75117409 // Generate the iteration count check for the main loop, *after* the check
75127410 // for the epilogue loop, so that the path-length is shorter for the case
75137411 // that goes directly through the vector epilogue. The longer-path length for
@@ -7611,11 +7509,14 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
76117509 EPI.EpilogueIterationCountCheck ->getTerminator ()->replaceUsesOfWith (
76127510 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
76137511
7614- if (EPI.SCEVSafetyCheck )
7615- EPI.SCEVSafetyCheck ->getTerminator ()->replaceUsesOfWith (
7512+ // Adjust the terminators of runtime check blocks and phis using them.
7513+ BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks ().second ;
7514+ BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks ().second ;
7515+ if (SCEVCheckBlock)
7516+ SCEVCheckBlock->getTerminator ()->replaceUsesOfWith (
76167517 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7617- if (EPI. MemSafetyCheck )
7618- EPI. MemSafetyCheck ->getTerminator ()->replaceUsesOfWith (
7518+ if (MemCheckBlock )
7519+ MemCheckBlock ->getTerminator ()->replaceUsesOfWith (
76197520 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
76207521
76217522 DT->changeImmediateDominator (LoopScalarPreHeader,
@@ -7642,10 +7543,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
76427543 }))
76437544 continue ;
76447545 Phi->removeIncomingValue (EPI.EpilogueIterationCountCheck );
7645- if (EPI. SCEVSafetyCheck )
7646- Phi->removeIncomingValue (EPI. SCEVSafetyCheck );
7647- if (EPI. MemSafetyCheck )
7648- Phi->removeIncomingValue (EPI. MemSafetyCheck );
7546+ if (SCEVCheckBlock )
7547+ Phi->removeIncomingValue (SCEVCheckBlock );
7548+ if (MemCheckBlock )
7549+ Phi->removeIncomingValue (MemCheckBlock );
76497550 }
76507551
76517552 replaceVPBBWithIRVPBB (Plan.getScalarPreheader (), LoopScalarPreHeader);
@@ -9380,6 +9281,43 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93809281 VPlanTransforms::runPass (VPlanTransforms::clearReductionWrapFlags, *Plan);
93819282}
93829283
9284+ void LoopVectorizationPlanner::attachRuntimeChecks (
9285+ VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9286+ const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks ();
9287+ if (SCEVCheckBlock) {
9288+ assert ((!CM.OptForSize ||
9289+ CM.Hints ->getForce () == LoopVectorizeHints::FK_Enabled) &&
9290+ " Cannot SCEV check stride or overflow when optimizing for size" );
9291+ VPlanTransforms::attachCheckBlock (Plan, SCEVCheckCond, SCEVCheckBlock,
9292+ HasBranchWeights);
9293+ }
9294+ const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks ();
9295+ if (MemCheckBlock) {
9296+ // VPlan-native path does not do any analysis for runtime checks
9297+ // currently.
9298+ assert ((!EnableVPlanNativePath || OrigLoop->isInnermost ()) &&
9299+ " Runtime checks are not supported for outer loops yet" );
9300+
9301+ if (CM.OptForSize ) {
9302+ assert (
9303+ CM.Hints ->getForce () == LoopVectorizeHints::FK_Enabled &&
9304+ " Cannot emit memory checks when optimizing for size, unless forced "
9305+ " to vectorize." );
9306+ ORE->emit ([&]() {
9307+ return OptimizationRemarkAnalysis (DEBUG_TYPE, " VectorizationCodeSize" ,
9308+ OrigLoop->getStartLoc (),
9309+ OrigLoop->getHeader ())
9310+ << " Code-size may be reduced by not forcing "
9311+ " vectorization, or by source-code modifications "
9312+ " eliminating the need for runtime checks "
9313+ " (e.g., adding 'restrict')." ;
9314+ });
9315+ }
9316+ VPlanTransforms::attachCheckBlock (Plan, MemCheckCond, MemCheckBlock,
9317+ HasBranchWeights);
9318+ }
9319+ }
9320+
93839321void VPDerivedIVRecipe::execute (VPTransformState &State) {
93849322 assert (!State.Lane && " VPDerivedIVRecipe being replicated." );
93859323
@@ -9501,10 +9439,7 @@ static bool processLoopInVPlanNativePath(
95019439 VPlan &BestPlan = LVP.getPlanFor (VF.Width );
95029440
95039441 {
9504- bool AddBranchWeights =
9505- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
9506- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
9507- AddBranchWeights, CM.CostKind );
9442+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
95089443 InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
95099444 VF.Width , 1 , &CM, BFI, PSI, Checks, BestPlan);
95109445 LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
@@ -10142,10 +10077,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1014210077 if (ORE->allowExtraAnalysis (LV_NAME))
1014310078 LVP.emitInvalidCostRemarks (ORE);
1014410079
10145- bool AddBranchWeights =
10146- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
10147- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
10148- AddBranchWeights, CM.CostKind );
10080+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
1014910081 if (LVP.hasPlanWithVF (VF.Width )) {
1015010082 // Select the interleave count.
1015110083 IC = CM.selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
0 commit comments