diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ac84ef80227e..0591c224424ed 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8384,8 +8384,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, LVer.prepareNoAliasMetadata(); } + // Create initial base VPlan0, to serve as common starting point for all + // candidates built later for specific VF ranges. + auto VPlan0 = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + auto MaxVFTimes2 = MaxVF * 2; - auto VPlan0 = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes( @@ -8624,23 +8629,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, having a basic block for the pre-header - // which contains SCEV expansions that need to happen before the CFG is - // modified; a basic block for the vector pre-header, followed by a region for - // the vector loop, followed by the middle basic block. The skeleton vector - // loop region contains a header and latch basic blocks. 
- bool RequiresScalarEpilogueCheck = LoopVectorizationPlanner::getDecisionAndClampRange( [this](ElementCount VF) { return !CM.requiresScalarEpilogue(VF.isVector()); }, Range); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, - CM.foldTailByMasking(), OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), - Legal->hasUncountableEarlyExit(), Range); + VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(), + Range); + VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck, + CM.foldTailByMasking()); + VPlanTransforms::createLoopRegions(*Plan); VPlanTransforms::createExtractsForLiveOuts(*Plan); @@ -8926,11 +8925,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { assert(!OrigLoop->isInnermost()); assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI); - VPlanTransforms::prepareForVectorization( - *Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop, - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false, - Range); + auto Plan = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + VPlanTransforms::handleEarlyExits(*Plan, + /*HasUncountableExit*/ false, Range); + VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, + /*TailFolded*/ false); + VPlanTransforms::createLoopRegions(*Plan); for (ElementCount VF : Range) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 7e8eff31c1fd3..b231a8429503f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -336,12 +336,6 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() { return std::move(Plan); } -std::unique_ptr<VPlan>
VPlanTransforms::buildPlainCFG(Loop *TheLoop, - LoopInfo &LI) { - PlainCFGBuilder Builder(TheLoop, &LI); - return Builder.buildPlainCFG(); -} - /// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it /// has exactly 2 predecessors (preheader and latch), where the block /// dominates the latch and the preheader dominates the block. If it is a @@ -457,10 +451,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, LatchDL); } -void VPlanTransforms::prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableEarlyExit, VFRange &Range) { +static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE, Loop *TheLoop) { VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -486,12 +478,54 @@ void VPlanTransforms::prepareForVectorization( addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL); - [[maybe_unused]] bool HandledUncountableEarlyExit = false; + // Create SCEV and VPValue for the trip count. + // We use the symbolic max backedge-taken-count, which works also when + // vectorizing loops with uncountable early exits. + const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); + assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && + "Invalid backedge-taken count"); + ScalarEvolution &SE = *PSE.getSE(); + const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, + InductionTy, TheLoop); + Plan.setTripCount( + vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); + + VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); + + // The connection order corresponds to the operands of the conditional branch, + // with the middle block already connected to the exit block.
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + // Also connect the entry block to the scalar preheader. + // TODO: Also introduce a branch recipe together with the minimum trip count + // check. + VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); + Plan.getEntry()->swapSuccessors(); +} + +std::unique_ptr<VPlan> +VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, + DebugLoc IVDL, PredicatedScalarEvolution &PSE) { + PlainCFGBuilder Builder(TheLoop, &LI); + std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG(); + addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); + return VPlan0; +} + +void VPlanTransforms::handleEarlyExits(VPlan &Plan, + bool HasUncountableEarlyExit, + VFRange &Range) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); + VPBlockBase *HeaderVPB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[1]); + // Disconnect all early exits from the loop leaving it with a single exit from // the latch. Early exits that are countable are left for a scalar epilog. The // condition of uncountable early exits (currently at most one is supported) // is fused into the latch exit, and used to branch from middle block to the // early exit destination.
+ [[maybe_unused]] bool HandledUncountableEarlyExit = false; for (VPIRBasicBlock *EB : Plan.getExitBlocks()) { for (VPBlockBase *Pred : to_vector(EB->getPredecessors())) { if (Pred == MiddleVPBB) @@ -500,7 +534,8 @@ void VPlanTransforms::prepareForVectorization( assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - HeaderVPBB, LatchVPBB, Range); + cast<VPBasicBlock>(HeaderVPB), LatchVPBB, + Range); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -513,36 +548,18 @@ void VPlanTransforms::prepareForVectorization( assert((!HasUncountableEarlyExit || HandledUncountableEarlyExit) && "missed an uncountable exit that must be handled"); +} - // Create SCEV and VPValue for the trip count. - // We use the symbolic max backedge-taken-count, which works also when - // vectorizing loops with uncountable early exits. - const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); - assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && - "Invalid loop count"); - ScalarEvolution &SE = *PSE.getSE(); - const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, - InductionTy, TheLoop); - Plan.setTripCount( - vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE)); - - VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); - VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); - - // The connection order corresponds to the operands of the conditional branch, - // with the middle block already connected to the exit block. - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); - // Also connect the entry block to the scalar preheader. - // TODO: Also introduce a branch recipe together with the minimum trip count - // check.
- VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); - Plan.getEntry()->swapSuccessors(); - +void VPlanTransforms::addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded) { + auto *MiddleVPBB = cast<VPBasicBlock>( + Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]); // If MiddleVPBB has a single successor then the original loop does not exit // via the latch and the single successor must be the scalar preheader. // There's no need to add a runtime check to MiddleVPBB. if (MiddleVPBB->getNumSuccessors() == 1) { - assert(MiddleVPBB->getSingleSuccessor() == ScalarPH && + assert(MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader() && "must have ScalarPH as single successor"); return; } @@ -564,6 +581,7 @@ void VPlanTransforms::prepareForVectorization( // the corresponding compare because they may have ended up with different // line numbers and we want to avoid awkward line stepping while debugging. // E.g., if the compare has got a line number inside the loop. + auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor()); DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); VPBuilder Builder(MiddleVPBB); VPValue *Cmp; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 2afe956a8917e..e49137cbaada3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -54,21 +54,30 @@ struct VPlanTransforms { verifyVPlanIsValid(Plan); } - LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop, - LoopInfo &LI); - - /// Prepare the plan for vectorization. It will introduce a dedicated - /// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit - /// block of the main vector loop (middle.block).
If a check is needed to - /// guard executing the scalar epilogue loop, it will be added to the middle - /// block, together with VPBasicBlocks for the scalar preheader and exit - /// blocks. \p InductionTy is the type of the canonical induction and used for - /// related values, like the trip count expression. It also creates a VPValue - /// expression for the original trip count. - LLVM_ABI_FOR_TEST static void prepareForVectorization( - VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE, - bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop, - DebugLoc IVDL, bool HasUncountableExit, VFRange &Range); + /// Create a base VPlan0, serving as the common starting point for all later + /// candidates. It consists of an initial plain CFG loop with loop blocks from + /// \p TheLoop being directly translated to VPBasicBlocks with VPInstruction + /// corresponding to the input IR. + /// + /// The created loop is wrapped in an initial skeleton to facilitate + /// vectorization, consisting of a vector pre-header, an exit block for the + /// main vector loop (middle.block) and a new block as preheader of the scalar + /// loop (scalar.ph). It also adds a canonical IV and its increment, using \p + /// InductionTy and \p IVDL, and creates a VPValue expression for the original + /// trip count. + LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan> + buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, + PredicatedScalarEvolution &PSE); + + /// Update \p Plan to account for all early exits. + LLVM_ABI_FOR_TEST static void + handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range); + + /// If a check is needed to guard executing the scalar epilogue loop, it will + /// be added to the middle block. + LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, + bool RequiresScalarEpilogueCheck, + bool TailFolded); /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's /// flat CFG into a hierarchical CFG.
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index 7dfd11a48b595..56f685801151a 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -72,10 +72,13 @@ class VPlanTestIRBase : public testing::Test { Loop *L = LI->getLoopFor(LoopHeader); PredicatedScalarEvolution PSE(*SE, *L); - auto Plan = VPlanTransforms::buildPlainCFG(L, *LI); + auto Plan = VPlanTransforms::buildVPlan0(L, *LI, IntegerType::get(*Ctx, 64), + {}, PSE); + VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2)); - VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64), - PSE, true, false, L, {}, false, R); + VPlanTransforms::handleEarlyExits(*Plan, false, R); + VPlanTransforms::addMiddleCheck(*Plan, true, false); + VPlanTransforms::createLoopRegions(*Plan); return Plan; }