diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ecdba9d1a1032..f466c2c3d3f80 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2380,10 +2380,12 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { // We just connected a new block to the scalar preheader. Update all // VPPhis by adding an incoming value for it, replicating the last value. + unsigned NumPredecessors = ScalarPH->getNumPredecessors(); for (VPRecipeBase &R : cast(ScalarPH)->phis()) { - auto *ResumePhi = cast(&R); - ResumePhi->addOperand( - ResumePhi->getOperand(ResumePhi->getNumOperands() - 1)); + assert(isa(&R) && "Phi expected to be VPPhi"); + assert(cast(&R)->getNumIncoming() == NumPredecessors - 1 && + "must have incoming values for all operands"); + R.addOperand(R.getOperand(NumPredecessors - 2)); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6a49d07e4cb2c..cbbc006e68212 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1136,6 +1136,10 @@ class VPPhiAccessors { return getAsRecipe()->getNumOperands(); } + /// Removes the incoming value for \p IncomingBlock, which must be a + /// predecessor. + void removeIncomingValueFor(VPBlockBase *IncomingBlock) const; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; @@ -3548,14 +3552,13 @@ template <> struct CastIsPossible { }; /// Support casting from VPRecipeBase -> VPPhiAccessors, by down-casting to the /// recipe types implementing VPPhiAccessors. Used by cast<>, dyn_cast<> & co. -template <> -struct CastInfo - : public CastIsPossible { +template +struct CastInfoVPPhiAccessors : public CastIsPossible { - using Self = CastInfo; + using Self = CastInfo; /// doCast is used by cast<>. - static inline VPPhiAccessors *doCast(const VPRecipeBase *R) { + static inline VPPhiAccessors *doCast(SrcTy R) { return const_cast([R]() -> const VPPhiAccessors * { switch (R->getVPDefID()) { case VPDef::VPInstructionSC: @@ -3571,12 +3574,18 @@ struct CastInfo } /// doCastIfPossible is used by dyn_cast<>. - static inline VPPhiAccessors *doCastIfPossible(const VPRecipeBase *f) { + static inline VPPhiAccessors *doCastIfPossible(SrcTy f) { if (!Self::isPossible(f)) return nullptr; return doCast(f); } }; +template <> +struct CastInfo + : CastInfoVPPhiAccessors {}; +template <> +struct CastInfo + : CastInfoVPPhiAccessors {}; /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index b92ea757060cb..593e5063802ba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -501,8 +501,10 @@ void VPlanTransforms::prepareForVectorization( cast(HeaderVPB), cast(LatchVPB), Range); HandledUncountableEarlyExit = true; + } else { + for (VPRecipeBase &R : EB->phis()) + cast(&R)->removeIncomingValueFor(Pred); } - cast(Pred)->getTerminator()->eraseFromParent(); VPBlockUtils::disconnectBlocks(Pred, EB); } @@ -526,32 +528,6 @@ void VPlanTransforms::prepareForVectorization( VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader()); - // If needed, add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. Three cases: - // 1) If we require a scalar epilogue, there is no conditional branch as - // we unconditionally branch to the scalar preheader. Remove the recipes - // from the exit blocks. - // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. - // Thus if tail is to be folded, we know we don't need to run the - // remainder and we can set the condition to true. - // 3) Otherwise, construct a runtime check. - - if (!RequiresScalarEpilogueCheck) { - if (auto *LatchExitVPB = MiddleVPBB->getSingleSuccessor()) - VPBlockUtils::disconnectBlocks(MiddleVPBB, LatchExitVPB); - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); - VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); - Plan.getEntry()->swapSuccessors(); - - // The exit blocks are unreachable, remove their recipes to make sure no - // users remain that may pessimize transforms. - for (auto *EB : Plan.getExitBlocks()) { - for (VPRecipeBase &R : make_early_inc_range(*EB)) - R.eraseFromParent(); - } - return; - } - // The connection order corresponds to the operands of the conditional branch, // with the middle block already connected to the exit block. VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -561,21 +537,45 @@ void VPlanTransforms::prepareForVectorization( VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); Plan.getEntry()->swapSuccessors(); - auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator(); - // Here we use the same DebugLoc as the scalar loop latch terminator instead - // of the corresponding compare because they may have ended up with - // different line numbers and we want to avoid awkward line stepping while - // debugging. Eg. if the compare has got a line number inside the loop. + // If MiddleVPBB has a single successor then the original loop does not exit + // via the latch and the single successor must be the scalar preheader. + // There's no need to add a runtime check to MiddleVPBB. + if (MiddleVPBB->getNumSuccessors() == 1) { + assert(MiddleVPBB->getSingleSuccessor() == ScalarPH && + "must have ScalarPH as single successor"); + return; + } + + assert(MiddleVPBB->getNumSuccessors() == 2 && "must have 2 successors"); + + // Add a check in the middle block to see if we have completed all of the + // iterations in the first vector loop. + // + // Three cases: + // 1) If we require a scalar epilogue, the scalar ph must execute. Set the + // condition to false. + // 2) If (N - N%VF) == N, then we *don't* need to run the + // remainder. Thus if tail is to be folded, we know we don't need to run + // the remainder and we can set the condition to true. + // 3) Otherwise, construct a runtime check. + + // We use the same DebugLoc as the scalar loop latch terminator instead of + // the corresponding compare because they may have ended up with different + // line numbers and we want to avoid awkward line stepping while debugging. + // E.g., if the compare has got a line number inside the loop. + DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc(); VPBuilder Builder(MiddleVPBB); - VPValue *Cmp = - TailFolded - ? Plan.getOrAddLiveIn(ConstantInt::getTrue( - IntegerType::getInt1Ty(TripCount->getType()->getContext()))) - : Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), - &Plan.getVectorTripCount(), - ScalarLatchTerm->getDebugLoc(), "cmp.n"); - Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp}, - ScalarLatchTerm->getDebugLoc()); + VPValue *Cmp; + if (!RequiresScalarEpilogueCheck) + Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse( + IntegerType::getInt1Ty(TripCount->getType()->getContext()))); + else if (TailFolded) + Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue( + IntegerType::getInt1Ty(TripCount->getType()->getContext()))); + else + Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), + &Plan.getVectorTripCount(), LatchDL, "cmp.n"); + Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp}, LatchDL); } void VPlanTransforms::createLoopRegions(VPlan &Plan) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4ee2fdecf2813..12f4602119a3c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1185,6 +1185,14 @@ void VPIRPhi::execute(VPTransformState &State) { State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator())); } +void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const { + VPRecipeBase *R = const_cast(getAsRecipe()); + assert(R->getNumOperands() == R->getParent()->getNumPredecessors() && + "Number of phi operands must match number of predecessors"); + unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock); + R->removeOperand(Position); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPhiAccessors::printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 6ea61c43f9528..5c8849be3d23e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1841,40 +1841,37 @@ void VPlanTransforms::truncateToMinimalBitwidths( } } -/// Remove BranchOnCond recipes with true conditions together with removing -/// dead edges to their successors. -static void removeBranchOnCondTrue(VPlan &Plan) { +/// Remove BranchOnCond recipes with true or false conditions together with +/// removing dead edges to their successors. +static void removeBranchOnConst(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getEntry()))) { + VPValue *Cond; if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() || - !match(&VPBB->back(), m_BranchOnCond(m_True()))) + !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond)))) continue; - VPBasicBlock *RemovedSucc = cast(VPBB->getSuccessors()[1]); - unsigned DeadIdx = RemovedSucc->getIndexForPredecessor(VPBB); - - // Values coming from VPBB into ResumePhi recipes of RemoveSucc are removed - // from these recipes. - for (VPRecipeBase &R : make_early_inc_range(*RemovedSucc)) { - assert((!isa(&R) || - !isa(cast(&R)->getInstruction())) && - !isa(&R) && - "Cannot update VPIRInstructions wrapping phis or header phis yet"); - auto *VPI = dyn_cast(&R); - if (!VPI) - break; - VPBuilder B(VPI); - SmallVector NewOperands; - // Create new operand list, with the dead incoming value filtered out. - for (const auto &[Idx, Op] : enumerate(VPI->operands())) { - if (Idx == DeadIdx) - continue; - NewOperands.push_back(Op); - } - VPI->replaceAllUsesWith( - B.createScalarPhi(NewOperands, VPI->getDebugLoc(), VPI->getName())); - VPI->eraseFromParent(); + unsigned RemovedIdx; + if (match(Cond, m_True())) + RemovedIdx = 1; + else if (match(Cond, m_False())) + RemovedIdx = 0; + else + continue; + + VPBasicBlock *RemovedSucc = + cast(VPBB->getSuccessors()[RemovedIdx]); + const auto &Preds = RemovedSucc->getPredecessors(); + assert(count(Preds, VPBB) == 1 && + "There must be a single edge between VPBB and its successor"); + // Values coming from VPBB into phi recipes of RemoveSucc are removed from + // these recipes. + for (VPRecipeBase &R : RemovedSucc->phis()) { + auto *Phi = cast(&R); + assert((!isa(&R) || RemovedSucc->getNumPredecessors() == 1) && + "VPIRPhis must have a single predecessor"); + Phi->removeIncomingValueFor(VPBB); } // Disconnect blocks and remove the terminator. RemovedSucc will be deleted // automatically on VPlan destruction if it becomes unreachable. @@ -1894,7 +1891,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(legalizeAndOptimizeInductions, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); - runPass(removeBranchOnCondTrue, Plan); + runPass(removeBranchOnConst, Plan); runPass(removeDeadRecipes, Plan); runPass(createAndOptimizeReplicateRegions, Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 01e8bf78ef04c..ad347ed6f50f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,6 +38,7 @@ class VPSlotTracker; class VPUser; class VPRecipeBase; class VPInterleaveRecipe; +class VPPhiAccessors; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -199,8 +200,18 @@ raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. class VPUser { + /// Grant access to removeOperand for VPPhiAccessors, the only supported user. + friend class VPPhiAccessors; + SmallVector Operands; + /// Removes the operand at index \p Idx. This also removes the VPUser from the + /// use-list of the operand. + void removeOperand(unsigned Idx) { + getOperand(Idx)->removeUser(*this); + Operands.erase(Operands.begin() + Idx); + } + protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the operands to \p O.