diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 11bb34e5ba702..407692dcbc2ad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -520,21 +520,23 @@ class LoopVectorizationPlanner {
   unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                  InstructionCost LoopCost);

+  /// Returns a mapping of SCEVs to their expanded IR values.
+  /// Note that this is a temporary workaround needed due to the current
+  /// epilogue handling.
+  DenseMap<const SCEV *, Value *> prepareToExecute(VPlan &Plan, ElementCount VF,
+                                                   unsigned UF,
+                                                   GeneratedRTChecks &RTChecks,
+                                                   bool VectorizingEpilogue);
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and \p UF.
   ///
   /// TODO: \p VectorizingEpilogue indicates if the executed VPlan is for the
   /// epilogue vector loop. It should be removed once the re-use issue has been
   /// fixed.
-  ///
-  /// Returns a mapping of SCEVs to their expanded IR values.
-  /// Note that this is a temporary workaround needed due to the current
-  /// epilogue handling.
-  DenseMap<const SCEV *, Value *> executePlan(ElementCount VF, unsigned UF,
-                                              VPlan &BestPlan,
-                                              InnerLoopVectorizer &LB,
-                                              DominatorTree *DT,
-                                              bool VectorizingEpilogue);
+  void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
+                   InnerLoopVectorizer &LB, DominatorTree *DT,
+                   bool VectorizingEpilogue);

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printPlans(raw_ostream &O);
@@ -568,6 +570,10 @@ class LoopVectorizationPlanner {
   void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF,
                                 ElementCount MinProfitableTripCount) const;

+  /// Attach the runtime checks of \p RTChecks to \p Plan.
+  void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
+                           bool HasBranchWeights) const;
+
 protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
@@ -606,10 +612,6 @@ class LoopVectorizationPlanner {
                           VPRecipeBuilder &RecipeBuilder, ElementCount MinVF);

-  /// Attach the runtime checks of \p RTChecks to \p Plan.
-  void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
-                           bool HasBranchWeights) const;
-
 #ifndef NDEBUG
   /// \return The most profitable vectorization factor for the available VPlans
   /// and the cost of that VF.
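The header change above splits the old all-in-one executePlan into three steps.
A minimal sketch of the intended calling sequence, using the same locals
(LVP, Checks, BestPlan, LB) as the call sites updated later in this patch;
treat it as illustrative rather than part of the patch itself:

    // 1) Attach runtime checks once per plan (previously done inside
    //    executePlan).
    LVP.attachRuntimeChecks(
        BestPlan, Checks,
        hasBranchWeightMD(*L->getLoopLatch()->getTerminator()));
    // 2) Run the pre-codegen VPlan transforms and expand SCEVs; the returned
    //    map is only needed when an epilogue loop is vectorized afterwards.
    DenseMap<const SCEV *, Value *> ExpandedSCEVs =
        LVP.prepareToExecute(BestPlan, VF.Width, IC, Checks,
                             /*VectorizingEpilogue=*/false);
    // 3) Generate IR for the prepared plan.
    LVP.executePlan(VF.Width, IC, BestPlan, LB, DT,
                    /*VectorizingEpilogue=*/false);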
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1b1797ab30a35..bca278cbeecf8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4059,7 +4059,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
     case VPDef::VPScalarIVStepsSC:
     case VPDef::VPReplicateSC:
     case VPDef::VPInstructionSC:
-    case VPDef::VPCanonicalIVPHISC:
     case VPDef::VPVectorPointerSC:
     case VPDef::VPVectorEndPointerSC:
     case VPDef::VPExpandSCEVSC:
@@ -7185,63 +7184,61 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }

-DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
-    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
-    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
-  assert(BestVPlan.hasVF(BestVF) &&
-         "Trying to execute plan with unsupported VF");
-  assert(BestVPlan.hasUF(BestUF) &&
-         "Trying to execute plan with unsupported UF");
-  if (BestVPlan.hasEarlyExit())
+DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::prepareToExecute(
+    VPlan &Plan, ElementCount VF, unsigned UF, GeneratedRTChecks &RTChecks,
+    bool VectorizingEpilogue) {
+  assert(Plan.hasVF(VF) && "Trying to execute plan with unsupported VF");
+  assert(Plan.hasUF(UF) && "Trying to execute plan with unsupported UF");
+  if (Plan.hasEarlyExit())
     ++LoopsEarlyExitVectorized;

   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
-  VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
-  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
-  VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
-  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
+  VPlanTransforms::runPass(VPlanTransforms::unrollByUF, Plan, UF);
+  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, Plan);
+  VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, Plan);
+  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, Plan, VF);
   bool HasBranchWeights =
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
   if (HasBranchWeights) {
     std::optional<unsigned> VScale = CM.getVScaleForTuning();
     VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
-                             BestVPlan, BestVF, VScale);
-  }
-
-  if (!VectorizingEpilogue) {
-    // Checks are the same for all VPlans, added to BestVPlan only for
-    // compactness.
-    attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+                             Plan, VF, VScale);
   }

   // Retrieving VectorPH now when it's easier while VPlan still has Regions.
-  VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
+  VPBasicBlock *VectorPH = cast<VPBasicBlock>(Plan.getVectorPreheader());

-  VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
-  VPlanTransforms::simplifyRecipes(BestVPlan);
-  VPlanTransforms::removeBranchOnConst(BestVPlan);
+  VPlanTransforms::optimizeForVFAndUF(Plan, VF, UF, PSE);
+  VPlanTransforms::simplifyRecipes(Plan);
+  VPlanTransforms::removeBranchOnConst(Plan);
   VPlanTransforms::narrowInterleaveGroups(
-      BestVPlan, BestVF,
+      Plan, VF,
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
-  VPlanTransforms::removeDeadRecipes(BestVPlan);
+  VPlanTransforms::removeDeadRecipes(Plan);

-  VPlanTransforms::convertToConcreteRecipes(BestVPlan);
+  VPlanTransforms::convertToConcreteRecipes(Plan);

   // Regions are dissolved after optimizing for VF and UF, which completely
   // removes unneeded loop regions first.
-  VPlanTransforms::dissolveLoopRegions(BestVPlan);
+  VPlanTransforms::dissolveLoopRegions(Plan);
   // Canonicalize EVL loops after regions are dissolved.
-  VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
-  VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
+  VPlanTransforms::canonicalizeEVLLoops(Plan);
+  VPlanTransforms::materializeBackedgeTakenCount(Plan, VectorPH);
   VPlanTransforms::materializeVectorTripCount(
-      BestVPlan, VectorPH, CM.foldTailByMasking(),
-      CM.requiresScalarEpilogue(BestVF.isVector()));
-  VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
-  VPlanTransforms::simplifyRecipes(BestVPlan);
+      Plan, VectorPH, CM.foldTailByMasking(),
+      CM.requiresScalarEpilogue(VF.isVector()));
+  VPlanTransforms::materializeVFAndVFxUF(Plan, VectorPH, VF);
+  VPlanTransforms::simplifyRecipes(Plan);

   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
   // making any changes to the CFG.
-  DenseMap<const SCEV *, Value *> ExpandedSCEVs =
-      VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
+  return VPlanTransforms::expandSCEVs(Plan, *PSE.getSE());
+}
+
+void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
+                                           VPlan &BestVPlan,
+                                           InnerLoopVectorizer &ILV,
+                                           DominatorTree *DT,
+                                           bool VectorizingEpilogue) {
   if (!ILV.getTripCount())
     ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
   else
@@ -7382,8 +7379,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.fixVectorizedLoop(State);

   ILV.printDebugTracesAtEnd();
-
-  return ExpandedSCEVs;
 }

 //===--------------------------------------------------------------------===//
@@ -8591,6 +8586,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                      m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
            "Did not find the canonical IV increment");
     cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
+    Plan->getCanonicalIV()->dropNUW();
   }

   // ---------------------------------------------------------------------------
@@ -8654,8 +8650,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     // latter are added above for masking.
     // FIXME: Migrate code relying on the underlying instruction from VPlan0
    // to construct recipes below to not use the underlying instruction.
-    if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(
-            &R) ||
+    if (isa<VPWidenCanonicalIVRecipe>(&R) ||
         (isa<VPInstruction>(&R) && !UnderlyingValue))
       continue;
@@ -8876,8 +8871,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                 Builder, BlockMaskCache, nullptr /*LVer*/);
   for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R))
-      continue;
     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
   }
@@ -9421,7 +9414,10 @@ static bool processLoopInVPlanNativePath(
                       << L->getHeader()->getParent()->getName() << "\"\n");
     LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
                                  VF.MinProfitableTripCount);
-
+    // Checks are the same for all VPlans, added to Plan only for
+    // compactness.
+    LVP.attachRuntimeChecks(BestPlan, Checks, hasBranchWeightMD(*L->getLoopLatch()->getTerminator()));
+    LVP.prepareToExecute(BestPlan, VF.Width, /*UF=*/1, Checks, false);
     LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
   }

@@ -9629,8 +9625,6 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
   for (VPRecipeBase &R :
        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R))
-      continue;
     EpiWidenedPhis.insert(
         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
   }
@@ -9719,51 +9713,49 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
   DenseMap<Value *, Value *> ToFrozen;
   // Ensure that the start values for all header phi recipes are updated before
   // vectorizing the epilogue loop.
-  for (VPRecipeBase &R : Header->phis()) {
-    if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
-      // When vectorizing the epilogue loop, the canonical induction start
-      // value needs to be changed from zero to the value after the main
-      // vector loop. Find the resume value created during execution of the main
-      // VPlan. It must be the first phi in the loop preheader.
-      // FIXME: Improve modeling for canonical IV start values in the epilogue
-      // loop.
-      using namespace llvm::PatternMatch;
-      PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
-      for (Value *Inc : EPResumeVal->incoming_values()) {
-        if (match(Inc, m_SpecificInt(0)))
-          continue;
-        assert(!EPI.VectorTripCount &&
-               "Must only have a single non-zero incoming value");
-        EPI.VectorTripCount = Inc;
-      }
-      // If we didn't find a non-zero vector trip count, all incoming values
-      // must be zero, which also means the vector trip count is zero. Pick the
-      // first zero as vector trip count.
-      // TODO: We should not choose VF * UF so the main vector loop is known to
-      // be dead.
-      if (!EPI.VectorTripCount) {
-        assert(
-            EPResumeVal->getNumIncomingValues() > 0 &&
-            all_of(EPResumeVal->incoming_values(),
-                   [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
-            "all incoming values must be 0");
-        EPI.VectorTripCount = EPResumeVal->getOperand(0);
-      }
-      VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
-      assert(all_of(IV->users(),
-                    [](const VPUser *U) {
-                      return isa<VPScalarIVStepsRecipe>(U) ||
-                             isa<VPDerivedIVRecipe>(U) ||
-                             cast<VPInstruction>(U)->isScalarCast() ||
-                             cast<VPInstruction>(U)->getOpcode() ==
-                                 Instruction::Add;
-                    }) &&
-             "the canonical IV should only be used by its increment or "
-             "ScalarIVSteps when resetting the start value");
-      IV->setOperand(0, VPV);
+  auto *IV = Plan.getCanonicalIV();
+  // When vectorizing the epilogue loop, the canonical induction start value
+  // needs to be changed from zero to the value after the main vector loop. Find
+  // the resume value created during execution of the main VPlan.
+  // FIXME: Improve modeling for canonical IV start values in the epilogue loop.
+  using namespace llvm::PatternMatch;
+  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
+  for (Value *Inc : EPResumeVal->incoming_values()) {
+    if (match(Inc, m_SpecificInt(0)))
       continue;
-    }
+    assert(!EPI.VectorTripCount &&
+           "Must only have a single non-zero incoming value");
+    EPI.VectorTripCount = Inc;
+  }
+  // If we didn't find a non-zero vector trip count, all incoming values
+  // must be zero, which also means the vector trip count is zero. Pick the
+  // first zero as vector trip count.
+  // TODO: We should not choose VF * UF so the main vector loop is known to
+  // be dead.
+  if (!EPI.VectorTripCount) {
+    assert(EPResumeVal->getNumIncomingValues() > 0 &&
+           all_of(EPResumeVal->incoming_values(),
+                  [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
+           "all incoming values must be 0");
+    EPI.VectorTripCount = EPResumeVal->getOperand(0);
+  }
+  VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+  assert(all_of(IV->users(),
+                [](const VPUser *U) {
+                  return isa<VPScalarIVStepsRecipe>(U) ||
+                         isa<VPDerivedIVRecipe>(U) ||
+                         cast<VPInstruction>(U)->isScalarCast() ||
+                         cast<VPInstruction>(U)->getOpcode() ==
+                             Instruction::Add;
+                }) &&
+         "the canonical IV should only be used by its increment or "
+         "ScalarIVSteps when resetting the start value");
+  IV->setOperand(0, VPV);
+
+  // Ensure that the start values for all header phi recipes are updated before
+  // vectorizing the epilogue loop.
+  for (VPRecipeBase &R : Header->phis()) {
     Value *ResumeV = nullptr;
     // TODO: Move setting of resume values to prepareToExecute.
     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
@@ -10222,10 +10214,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         VPlanTransforms::runPass(
             VPlanTransforms::materializeConstantVectorTripCount, BestPlan,
             VF.Width, IC, PSE);
+        LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC, VF.MinProfitableTripCount);
+        LVP.attachRuntimeChecks(BestPlan, Checks, hasBranchWeightMD(*L->getLoopLatch()->getTerminator()));
+        LVP.prepareToExecute(BestPlan, VF.Width, IC, Checks, false);
         LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
-
         ORE->emit([&]() {
           return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                     L->getHeader())
@@ -10252,8 +10246,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                            BestEpiPlan);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
                                            BFI, PSI, Checks, *BestMainPlan);
-        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             *BestMainPlan, MainILV, DT, false);
+
+        LVP.attachRuntimeChecks(*BestMainPlan, Checks, hasBranchWeightMD(*L->getLoopLatch()->getTerminator()));
+        auto ExpandedSCEVs = LVP.prepareToExecute(
+            *BestMainPlan, EPI.MainLoopVF, EPI.MainLoopUF, Checks, false);
+        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV,
+                        DT, false);
         ++LoopsVectorized;

         // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10263,6 +10261,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         EpilogILV.setTripCount(MainILV.getTripCount());
         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);

+        LVP.prepareToExecute(BestEpiPlan, EPI.EpilogueVF, EPI.EpilogueUF,
+                             Checks, true);
         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                         DT, true);

@@ -10293,7 +10293,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
             VF.Width, IC, PSE);
         LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
                                      VF.MinProfitableTripCount);
-
+        LVP.attachRuntimeChecks(BestPlan, Checks, hasBranchWeightMD(*L->getLoopLatch()->getTerminator()));
+        LVP.prepareToExecute(BestPlan, VF.Width, IC, Checks, false);
         LVP.executePlan(VF.Width, IC, BestPlan, LB,
                         DT, false);

         ++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1438dc366b55d..d88e82cd442ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -771,10 +771,13 @@ static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {

 VPRegionBlock *VPRegionBlock::clone() {
   const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
-  auto *NewRegion = getPlan()->createVPRegionBlock(NewEntry, NewExiting,
-                                                   getName(), isReplicator());
+  auto *NewRegion =
+      getPlan()->createVPRegionBlock(NewEntry, NewExiting, getName());
   for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
     Block->setParent(NewRegion);
+
+  if (CanIV)
+    NewRegion->CanIV = CanIV->clone();
   return NewRegion;
 }

@@ -868,6 +871,11 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
   auto NewIndent = Indent + "  ";
+  if (CanIV) {
+    O << '\n';
+    CanIV->print(O, NewIndent, SlotTracker);
+    O << '\n';
+  }
   for (auto *BlockBase : vp_depth_first_shallow(Entry)) {
     O << '\n';
     BlockBase->print(O, NewIndent, SlotTracker);
@@ -880,18 +888,38 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,

 void VPRegionBlock::dissolveToCFGLoop() {
   auto *Header = cast<VPBasicBlock>(getEntry());
-  if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(&Header->front())) {
-    assert(this == getPlan()->getVectorLoopRegion() &&
-           "Canonical IV must be in the entry of the top-level loop region");
-    auto *ScalarR = VPBuilder(CanIV).createScalarPhi(
-        {CanIV->getStartValue(), CanIV->getBackedgeValue()},
-        CanIV->getDebugLoc(), "index");
+  auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
+  if (CanIV && CanIV->getNumUsers() > 0) {
+    auto *ExitingTerm = ExitingLatch->getTerminator();
+    VPInstruction *CanIVInc = nullptr;
+    for (VPUser *U : CanIV->users()) {
+      VPValue *Inc;
+      if (!match(U, m_VPInstruction<Instruction::Add>(m_Specific(CanIV),
+                                                      m_VPValue(Inc))))
+        continue;
+      auto *IncCand = cast<VPInstruction>(U);
+      if (Inc != &getPlan()->getVFxUF() &&
+          (IncCand->getNumUsers() != 1 ||
+           !match(*IncCand->user_begin(),
+                  m_BranchOnCount(m_Specific(IncCand), m_VPValue()))))
+        continue;
+      assert(!CanIVInc && "Expecting at most one canonical IV increment");
+      CanIVInc = cast<VPInstruction>(U);
+    }
+    if (!CanIVInc) {
+      CanIVInc = VPBuilder(ExitingTerm)
+                     .createOverflowingOp(Instruction::Add,
+                                          {CanIV, &getPlan()->getVFxUF()},
+                                          {CanIV->hasNoUnsignedWrap(), false},
+                                          CanIV->getDebugLoc(), "index.next");
+    }
+    auto *ScalarR = VPBuilder(Header, Header->begin())
+                        .createScalarPhi({CanIV->getStartValue(), CanIVInc},
+                                         CanIV->getDebugLoc(), "index");
     CanIV->replaceAllUsesWith(ScalarR);
-    CanIV->eraseFromParent();
   }

   VPBlockBase *Preheader = getSinglePredecessor();
-  auto *ExitingLatch = cast<VPBasicBlock>(getExiting());
   VPBlockBase *Middle = getSingleSuccessor();
   VPBlockUtils::disconnectBlocks(Preheader, this);
   VPBlockUtils::disconnectBlocks(this, Middle);
@@ -928,7 +956,10 @@ VPlan::~VPlan() {
         for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
           R.setOperand(I, &DummyValue);
       }
+    } else if (auto *CanIV = cast<VPRegionBlock>(VPB)->getCanonicalIV()) {
+      CanIV->replaceAllUsesWith(&DummyValue);
     }
+
     delete VPB;
   }
   for (VPValue *VPV : getLiveIns())
@@ -1212,6 +1243,11 @@ VPlan *VPlan::duplicate() {
   // else NewTripCount will be created and inserted into Old2NewVPValues when
   // TripCount is cloned. In any case NewPlan->TripCount is updated below.
+  if (auto *LoopRegion = getVectorLoopRegion()) {
+    Old2NewVPValues[LoopRegion->getCanonicalIV()] =
+        NewPlan->getVectorLoopRegion()->getCanonicalIV();
+  }
+
   remapOperands(Entry, NewEntry, Old2NewVPValues);

   // Initialize remaining fields of cloned VPlan.
@@ -1392,6 +1428,8 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
 /// Returns true if there is a vector loop region and \p VPV is defined in a
 /// loop region.
 static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
+  if (isa<VPCanonicalIV>(VPV))
+    return true;
   const VPRecipeBase *DefR = VPV->getDefiningRecipe();
   return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
                   DefR->getParent()->getEnclosingLoopRegion());
@@ -1501,9 +1539,12 @@ void VPSlotTracker::assignNames(const VPlan &Plan) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
       RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
-  for (const VPBasicBlock *VPBB :
-       VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
-    assignNames(VPBB);
+  for (const VPBlockBase *VPB : RPOT) {
+    if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
+      assignNames(VPBB);
+    } else if (auto *CanIV = cast<VPRegionBlock>(VPB)->getCanonicalIV())
+      assignName(CanIV);
+  }
 }

 void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8ed26f23c859b..f024c929d97d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -547,7 +547,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenSelectSC:
     case VPRecipeBase::VPBlendSC:
     case VPRecipeBase::VPPredInstPHISC:
-    case VPRecipeBase::VPCanonicalIVPHISC:
     case VPRecipeBase::VPActiveLaneMaskPHISC:
     case VPRecipeBase::VPFirstOrderRecurrencePHISC:
     case VPRecipeBase::VPWidenPHISC:
@@ -1927,12 +1926,6 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
/// the backedge is the second operand.
///
/// Inductions are modeled using the following sub-classes:
-///  * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
-///    starting at a specified value (zero for the main vector loop, the resume
-///    value for the epilogue vector loop) and stepping by 1. The induction
-///    controls exiting of the vector loop by comparing against the vector trip
-///    count. Produces a single scalar PHI for the induction value per
-///    iteration.
///  * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
///    floating point inductions with arbitrary start and step values. Produces
///    a vector PHI per-part.
@@ -3285,61 +3278,51 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe {
   const SCEV *getSCEV() const { return Expr; }
 };

-/// Canonical scalar induction phi of the vector loop. Starting at the specified
+/// Canonical scalar induction of the vector loop. Starting at the specified
 /// start value (either 0 or the resume value when vectorizing the epilogue
-/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
-/// canonical induction variable.
-class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
+/// loop).
+class VPCanonicalIV : public VPValue, public VPUser {
+  bool HasNUW = true;
+  DebugLoc DL;
+
 public:
-  VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
-      : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
+  VPCanonicalIV(VPValue *StartV, bool HasNUW = true,
+                DebugLoc DL = DebugLoc::getUnknown())
+      : VPValue(VPValue::VPCanonicalIVSC), VPUser(StartV), HasNUW(HasNUW),
+        DL(DL) {}

-  ~VPCanonicalIVPHIRecipe() override = default;
+  ~VPCanonicalIV() override = default;

-  VPCanonicalIVPHIRecipe *clone() override {
-    auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
-    R->addOperand(getBackedgeValue());
-    return R;
+  VPCanonicalIV *clone() {
+    return new VPCanonicalIV(getOperand(0), HasNUW, DL);
   }

-  VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPCanonicalIVSC;
+  }

-  void execute(VPTransformState &State) override {
-    llvm_unreachable("cannot execute this recipe, should be replaced by a "
-                     "scalar phi recipe");
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
   }

+  void dropNUW() { HasNUW = false; }
+  bool hasNoUnsignedWrap() const { return HasNUW; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
+             VPSlotTracker &SlotTracker) const;
 #endif

+  VPValue *getStartValue() const { return getOperand(0); }
+  DebugLoc getDebugLoc() { return DL; }
+
   /// Returns the scalar type of the induction.
   Type *getScalarType() const {
     return getStartValue()->getLiveInIRValue()->getType();
   }
-
-  /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
-    assert(is_contained(operands(), Op) &&
-           "Op must be an operand of the recipe");
-    return true;
-  }
-
-  /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
-    assert(is_contained(operands(), Op) &&
-           "Op must be an operand of the recipe");
-    return true;
-  }
-
-  /// Return the cost of this VPCanonicalIVPHIRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override {
-    // For now, match the behavior of the legacy cost model.
-    return 0;
-  }
 };

 /// A recipe for generating the active lane mask for the vector loop that is
@@ -3420,14 +3403,13 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
 class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
                                  public VPUnrollPartAccessor<1> {
 public:
-  VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
+  VPWidenCanonicalIVRecipe(VPValue *CanonicalIV)
       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}

   ~VPWidenCanonicalIVRecipe() override = default;

   VPWidenCanonicalIVRecipe *clone() override {
-    return new VPWidenCanonicalIVRecipe(
-        cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
+    return new VPWidenCanonicalIVRecipe(getOperand(0));
   }

   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
@@ -3466,7 +3448,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {

 public:
   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
-                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
+                    VPCanonicalIV *CanonicalIV, VPValue *Step,
                     const Twine &Name = "")
       : VPDerivedIVRecipe(
             IndDesc.getKind(),
@@ -3830,26 +3812,39 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
   /// VPRegionBlock.
   VPBlockBase *Exiting;

-  /// An indicator whether this region is to generate multiple replicated
-  /// instances of output IR corresponding to its VPBlockBases.
-  bool IsReplicator;
+  /// Canonical IV of the loop region. If nullptr, the region is a replication
+  /// region.
+  VPCanonicalIV *CanIV = nullptr;

   /// Use VPlan::createVPRegionBlock to create VPRegionBlocks.
   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
-                const std::string &Name = "", bool IsReplicator = false)
+                const std::string &Name = "")
+      : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting) {
+    assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+    assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
+    Entry->setParent(this);
+    Exiting->setParent(this);
+  }
+
+  VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, VPCanonicalIV *CanIV,
+                const std::string &Name = "")
       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
-        IsReplicator(IsReplicator) {
+        CanIV(CanIV) {
     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
     Entry->setParent(this);
     Exiting->setParent(this);
   }
-  VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+
+  VPRegionBlock(VPValue *StartV, DebugLoc DL, const std::string &Name = "")
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
-        IsReplicator(IsReplicator) {}
+        CanIV(new VPCanonicalIV(StartV, true, DL)) {}

 public:
-  ~VPRegionBlock() override {}
+  ~VPRegionBlock() override {
+    if (CanIV)
+      delete CanIV;
+  }

   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPBlockBase *V) {
@@ -3888,7 +3883,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {

   /// An indicator whether this region is to generate multiple replicated
   /// instances of output IR corresponding to its VPBlockBases.
-  bool isReplicator() const { return IsReplicator; }
+  bool isReplicator() const { return !CanIV; }

   /// The method which generates the output IR instructions that correspond to
   /// this VPRegionBlock, thereby "executing" the VPlan.
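With VPRegionBlock now owning the canonical IV (hunks above and below),
consumers no longer look for a phi recipe at the front of the header block.
A hedged sketch of the lookup pattern this enables; adjustStartValue is a
hypothetical helper for illustration, not part of the patch:

    // Hypothetical helper: reset the canonical IV's start value for a plan's
    // vector loop region, if it has one (replicate regions have none).
    static void adjustStartValue(VPlan &Plan, VPValue *NewStart) {
      VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
      if (VPCanonicalIV *CanIV = LoopRegion->getCanonicalIV())
        CanIV->setOperand(0, NewStart); // start value is operand 0 (VPUser)
    }

This mirrors how preparePlanForEpilogueVectorLoop resets the start value via
IV->setOperand(0, VPV) earlier in the patch.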
@@ -3916,6 +3911,11 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
   /// Remove the current region from its VPlan, connecting its predecessor to
   /// its entry, and its exiting block to its successor.
   void dissolveToCFGLoop();
+
+  /// Return the canonical induction variable of the region, null for
+  /// replicating regions.
+  VPCanonicalIV *getCanonicalIV() { return CanIV; }
+  const VPCanonicalIV *getCanonicalIV() const { return CanIV; }
 };

 /// VPlan models a candidate for vectorization, encoding various decisions take
@@ -4227,14 +4227,9 @@ class VPlan {
   LLVM_DUMP_METHOD void dump() const;
 #endif

-  /// Returns the canonical induction recipe of the vector loop.
-  VPCanonicalIVPHIRecipe *getCanonicalIV() {
-    VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
-    if (EntryVPBB->empty()) {
-      // VPlan native path.
-      EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
-    }
-    return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+  /// Returns the canonical induction VPValue of the vector loop.
+  VPCanonicalIV *getCanonicalIV() {
+    return getVectorLoopRegion()->getCanonicalIV();
   }

   VPValue *getSCEVExpansion(const SCEV *S) const {
@@ -4260,22 +4255,22 @@ class VPlan {
     return VPB;
   }

-  /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
-  /// IsReplicator is true, the region is a replicate region. The returned block
-  /// is owned by the VPlan and deleted once the VPlan is destroyed.
+  /// Create a new replicate VPRegionBlock with \p Entry, \p Exiting and \p
+  /// Name. The returned block is owned by the VPlan and deleted once the VPlan
+  /// is destroyed.
   VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
-                                     const std::string &Name = "",
-                                     bool IsReplicator = false) {
-    auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+                                     const std::string &Name = "") {
+    auto *VPB = new VPRegionBlock(Entry, Exiting, Name);
     CreatedBlocks.push_back(VPB);
     return VPB;
   }

-  /// Create a new loop VPRegionBlock with \p Name and entry and exiting blocks set
-  /// to nullptr. The returned block is owned by the VPlan and deleted once the
-  /// VPlan is destroyed.
-  VPRegionBlock *createVPRegionBlock(const std::string &Name = "") {
-    auto *VPB = new VPRegionBlock(Name);
+  /// Create a new loop VPRegionBlock with \p StartV and \p Name, and entry and
+  /// exiting blocks set to nullptr. The returned block is owned by the VPlan
+  /// and deleted once the VPlan is destroyed.
+  VPRegionBlock *createVPRegionBlock(VPValue *StartV, DebugLoc DL,
+                                     const std::string &Name = "") {
+    auto *VPB = new VPRegionBlock(StartV, DL, Name);
     CreatedBlocks.push_back(VPB);
     return VPB;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 747c6623aa22a..5187771913944 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -10,6 +10,7 @@
 #include "VPlan.h"
 #include "VPlanCFG.h"
 #include "VPlanDominatorTree.h"
+#include "VPlanHelpers.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -23,11 +24,8 @@ using namespace llvm;

 VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) : Ctx(Plan.getContext()) {
   if (auto LoopRegion = Plan.getVectorLoopRegion()) {
-    if (const auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(
-            &LoopRegion->getEntryBasicBlock()->front())) {
-      CanonicalIVTy = CanIV->getScalarType();
-      return;
-    }
+    CanonicalIVTy = LoopRegion->getCanonicalIV()->getScalarType();
+    return;
   }

   // If there's no canonical IV, retrieve the type from the trip count
@@ -269,18 +267,20 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
     return CanonicalIVTy;
   }

+  if (auto *CanIV = dyn_cast<VPCanonicalIV>(V))
+    return CanonicalIVTy;
+
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
-          .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
-                VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
-                VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
-              [this](const auto *R) {
-                // Handle header phi recipes, except VPWidenIntOrFpInduction
-                // which needs special handling due it being possibly truncated.
-                // TODO: consider inferring/caching type of siblings, e.g.,
-                // backedge value, here and in cases below.
-                return inferScalarType(R->getStartValue());
-              })
+          .Case<VPActiveLaneMaskPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
+                VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+                VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+            // Handle header phi recipes, except VPWidenIntOrFpInduction
+            // which needs special handling due it being possibly truncated.
+            // TODO: consider inferring/caching type of siblings, e.g.,
+            // backedge value, here and in cases below.
+            return inferScalarType(R->getStartValue());
+          })
          .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
              [](const auto *R) { return R->getScalarType(); })
          .Case<
   if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
     return RR->getVFScaleFactor();
   if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
@@ -422,15 +422,15 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
   // Each 'key' in the map opens a new interval. The values
   // of the map are the index of the 'last seen' usage of the
-  // recipe that is the key.
-  using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
+  // VPValue that is the key.
+  using IntervalMap = SmallDenseMap<VPValue *, unsigned, 16>;

   // Maps indices to recipes.
   SmallVector<VPRecipeBase *, 64> Idx2Recipe;
   // Marks the end of each interval.
   IntervalMap EndPoint;
-  // Saves the list of recipe indices that are used in the loop.
-  SmallPtrSet<VPRecipeBase *, 8> Ends;
+  // Saves the list of VPValues that are used in the loop.
+  SmallPtrSet<VPValue *, 8> Ends;
   // Saves the list of values that are used in the loop but are defined outside
   // the loop (not including non-recipe values such as arguments and
   // constants).
@@ -441,7 +441,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
   // each recipe. We use RPO to ensure that defs are met before their users. We
   // assume that each recipe that has in-loop users starts an interval. We
   // record every time that an in-loop value is used, so we have a list of the
-  // first and last occurrences of each recipe.
+  // first occurrences of each recipe and last occurrence of each VPValue.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       LoopRegion);
@@ -459,43 +459,44 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       // FIXME: Might need some motivation why these values are ignored. If
       // for example an argument is used inside the loop it will increase the
       // register pressure (so shouldn't we add it to LoopInvariants).
-      if (!DefR && (!U->getLiveInIRValue() ||
-                    !isa<Instruction>(U->getLiveInIRValue())))
+      if (!isa<VPCanonicalIV>(U) && !DefR &&
+          (!U->getLiveInIRValue() ||
+           !isa<Instruction>(U->getLiveInIRValue())))
         continue;

       // If this recipe is outside the loop then record it and continue.
-      if (!DefR) {
+      if (!DefR && !isa<VPCanonicalIV>(U)) {
         LoopInvariants.insert(U);
         continue;
       }

       // Overwrite previous end points.
-      EndPoint[DefR] = Idx2Recipe.size();
-      Ends.insert(DefR);
+      EndPoint[U] = Idx2Recipe.size();
+      Ends.insert(U);
     }
   }
   if (VPBB == LoopRegion->getExiting()) {
     // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
     // exiting block, where their increment will get materialized eventually.
     for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        EndPoint[&R] = Idx2Recipe.size();
-        Ends.insert(&R);
+      if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+        EndPoint[WideIV] = Idx2Recipe.size();
+        Ends.insert(WideIV);
       }
     }
   }
 }

 // Saves the list of intervals that end with the index in 'key'.
-using RecipeList = SmallVector<VPRecipeBase *, 2>;
-SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
+using VPValueList = SmallVector<VPValue *, 2>;
+SmallDenseMap<unsigned, VPValueList, 16> TransposeEnds;

 // Next, we transpose the EndPoints into a multi map that holds the list of
 // intervals that *end* at a specific location.
 for (auto &Interval : EndPoint)
   TransposeEnds[Interval.second].push_back(Interval.first);

-SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
+SmallPtrSet<VPValue *, 8> OpenIntervals;
 SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
@@ -512,6 +513,10 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
   };

+  if (auto *CanIV = LoopRegion->getCanonicalIV())
+    if (CanIV->getNumUsers() != 0)
+      OpenIntervals.insert(CanIV);
+
   // We scan the instructions linearly and record each time that a new interval
   // starts, by placing it in a set. If we find this value in TransposEnds then
   // we remove it from the set. The max register usage is the maximum register
   for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
     VPRecipeBase *R = Idx2Recipe[Idx];

-    // Remove all of the recipes that end at this location.
-    RecipeList &List = TransposeEnds[Idx];
-    for (VPRecipeBase *ToRemove : List)
+    // Remove all of the VPValues that end at this location.
+    VPValueList &List = TransposeEnds[Idx];
+    for (VPValue *ToRemove : List)
       OpenIntervals.erase(ToRemove);

     // Ignore recipes that are never used within the loop and do not have side
     // effects.
-    if (!Ends.count(R) && !R->mayHaveSideEffects())
+    if (all_of(R->definedValues(),
+               [&Ends](VPValue *Def) { return !Ends.count(Def); }) &&
+        !R->mayHaveSideEffects())
       continue;

     // Skip recipes for ignored values.
@@ -546,41 +553,38 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     // there is no previous entry for ClassID.
     SmallMapVector<unsigned, unsigned, 4> RegUsage;

-    for (auto *R : OpenIntervals) {
-      // Skip recipes that weren't present in the original loop.
+    for (auto *VPV : OpenIntervals) {
+      // Skip values that weren't present in the original loop.
       // TODO: Remove after removing the legacy
       // LoopVectorizationCostModel::calculateRegisterUsage
       if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
-              VPBranchOnMaskRecipe>(R))
+              VPBranchOnMaskRecipe>(VPV))
         continue;

       if (VFs[J].isScalar() ||
-          isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
-              VPScalarIVStepsRecipe>(R) ||
-          (isa<VPInstruction>(R) &&
-           vputils::onlyScalarValuesUsed(cast<VPSingleDefRecipe>(R))) ||
-          (isa<VPReductionPHIRecipe>(R) &&
-           (cast<VPReductionPHIRecipe>(R))->isInLoop())) {
-        unsigned ClassID = TTI.getRegisterClassForType(
-            false, TypeInfo.inferScalarType(R->getVPSingleValue()));
+          isa<VPCanonicalIV, VPReplicateRecipe, VPDerivedIVRecipe,
+              VPScalarIVStepsRecipe>(VPV) ||
+          (isa<VPInstruction>(VPV) && vputils::onlyScalarValuesUsed(VPV)) ||
+          (isa<VPReductionPHIRecipe>(VPV) &&
+           (cast<VPReductionPHIRecipe>(VPV))->isInLoop())) {
+        unsigned ClassID =
+            TTI.getRegisterClassForType(false, TypeInfo.inferScalarType(VPV));
         // FIXME: The target might use more than one register for the type
         // even in the scalar case.
         RegUsage[ClassID] += 1;
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        unsigned ScaleFactor = getVFScaleFactor(R);
+        unsigned ScaleFactor = getVFScaleFactor(VPV);
         ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
                  << " for " << *R << "\n";
         });

-        for (VPValue *DefV : R->definedValues()) {
-          Type *ScalarTy = TypeInfo.inferScalarType(DefV);
-          unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
-          RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
-        }
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
+        RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
       }
     }

@@ -593,8 +597,10 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
                       << OpenIntervals.size() << '\n');

-    // Add the current recipe to the list of open intervals.
-    OpenIntervals.insert(R);
+    // Add the VPValues defined by the current recipe to the list of open
+    // intervals.
+    for (VPValue *DefV : R->definedValues())
+      OpenIntervals.insert(DefV);
   }

   // We also search for instructions that are defined outside the loop, but are
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index c1c525ee23122..21485fea004b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -392,7 +392,7 @@ static bool canonicalHeaderAndLatch(VPBlockBase *HeaderVPB,
 /// Create a new VPRegionBlock for the loop starting at \p HeaderVPB.
 static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
   auto *PreheaderVPBB = HeaderVPB->getPredecessors()[0];
-  auto *LatchVPBB = HeaderVPB->getPredecessors()[1];
+  auto *LatchVPBB = cast<VPBasicBlock>(HeaderVPB->getPredecessors()[1]);

   VPBlockUtils::disconnectBlocks(PreheaderVPBB, HeaderVPB);
   VPBlockUtils::disconnectBlocks(LatchVPBB, HeaderVPB);
@@ -403,13 +403,24 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) {
   // LatchExitVPB, taking care to preserve the original predecessor & successor
   // order of blocks. Set region entry and exiting after both HeaderVPB and
   // LatchVPBB have been disconnected from their predecessors/successors.
-  auto *R = Plan.createVPRegionBlock();
+  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(&Plan.getVFxUF());
+  VPPhi *ScalarCanIV = nullptr;
+  if (PreheaderVPBB->getSinglePredecessor() == Plan.getEntry())
+    ScalarCanIV = cast<VPPhi>(&*cast<VPBasicBlock>(HeaderVPB)->begin());
+  auto *R = Plan.createVPRegionBlock(
+      Plan.getOrAddLiveIn(Constant::getNullValue(IVTy)),
+      ScalarCanIV ? ScalarCanIV->getDebugLoc()
+                  : DebugLoc::getCompilerGenerated());
   VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, R);
   VPBlockUtils::disconnectBlocks(LatchVPBB, R);
   VPBlockUtils::connectBlocks(PreheaderVPBB, R);
   R->setEntry(HeaderVPB);
   R->setExiting(LatchVPBB);

+  if (ScalarCanIV) {
+    ScalarCanIV->replaceAllUsesWith(R->getCanonicalIV());
+    ScalarCanIV->eraseFromParent();
+  }
   // All VPBB's reachable shallowly from HeaderVPB belong to the current region.
   for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPB))
     VPBB->setParent(R);
@@ -422,9 +433,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                   DebugLoc DL) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
-
-  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
-  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
+  auto *CanonicalIVPHI = new VPPhi(StartV, DL);
   HeaderVPBB->insert(CanonicalIVPHI, HeaderVPBB->begin());

   // We are about to replace the branch to exit the region. Remove the original
@@ -437,14 +446,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
   }

   VPBuilder Builder(LatchVPBB);
-  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
-  // Initially the induction increment is guaranteed to not wrap, but that may
-  // change later, e.g. when tail-folding, when the flags need to be dropped.
-  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
+  auto CanonicalIVIncrement = Builder.createOverflowingOp(
       Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {true, false}, DL,
       "index.next");
-  CanonicalIVPHI->addOperand(CanonicalIVIncrement);
-
   // Add the BranchOnCount VPInstruction to the latch.
   Builder.createNaryOp(VPInstruction::BranchOnCount,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()},
@@ -789,7 +793,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   VPReductionPHIRecipe *RedPhiR = nullptr;
   bool HasUnsupportedPhi = false;
   for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
-    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
+    if (isa<VPWidenIntOrFpInductionRecipe>(&R))
       continue;
     auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!Cur) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 18c2bef90f08f..fdd8a2144e9f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -253,7 +253,6 @@ struct Recipe_match {
     auto *DefR = dyn_cast<RecipeTy>(R);
     // Check for recipes that do not have opcodes.
     if constexpr (std::is_same<RecipeTy, VPScalarIVStepsRecipe>::value ||
-                  std::is_same<RecipeTy, VPCanonicalIVPHIRecipe>::value ||
                   std::is_same<RecipeTy, VPWidenGEPRecipe>::value ||
                   std::is_same<RecipeTy, VPDerivedIVRecipe>::value)
       return DefR;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bd9a93ed57b8a..fb487e00eae25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -66,7 +66,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
                ->onlyReadsMemory();
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
-  case VPCanonicalIVPHISC:
   case VPBranchOnMaskSC:
   case VPFirstOrderRecurrencePHISC:
   case VPReductionPHISC:
@@ -1192,6 +1191,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::Select:
   case Instruction::PHI:
   case VPInstruction::AnyOf:
+  case VPInstruction::Broadcast:
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
@@ -2309,7 +2309,7 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
+  auto *CanIV = getParent()->getParent()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
          getScalarType() == CanIV->getScalarType();
 }
@@ -3878,11 +3878,11 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
-                                   VPSlotTracker &SlotTracker) const {
-  O << Indent << "EMIT ";
+void VPCanonicalIV::print(raw_ostream &O, const Twine &Indent,
+                          VPSlotTracker &SlotTracker) const {
+  O << Indent;
   printAsOperand(O, SlotTracker);
-  O << " = CANONICAL-INDUCTION ";
+  O << " = CANONICAL-IV ";
   printOperands(O, SlotTracker);
 }
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c5f9b7302292..e18ed89602099 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -367,8 +367,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
   PredRecipe->eraseFromParent();
   auto *Exiting =
       Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
-  VPRegionBlock *Region =
-      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);
+  VPRegionBlock *Region = Plan.createVPRegionBlock(Entry, Exiting, RegionName);

   // Note: first set Entry as region entry and then connect successors starting
   // from it in order, to propagate the "parent" of each VPBasicBlock.
@@ -496,7 +495,7 @@ static void removeRedundantInductionCasts(VPlan &Plan) {

 /// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
 /// recipe, if it exists.
 static void removeRedundantCanonicalIVs(VPlan &Plan) {
-  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  VPCanonicalIV *CanonicalIV = Plan.getCanonicalIV();
   VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
   for (VPUser *U : CanonicalIV->users()) {
     WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
@@ -578,7 +577,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                     VPValue *StartV, VPValue *Step, DebugLoc DL,
                     VPBuilder &Builder) {
   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
-  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  VPCanonicalIV *CanonicalIV = Plan.getCanonicalIV();
   VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
       Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
@@ -1454,9 +1453,11 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   });

   auto *CanIV = Plan.getCanonicalIV();
-  if (!match(Cond, m_SpecificICmp(CmpInst::ICMP_EQ,
-                                  m_Specific(CanIV->getBackedgeValue()),
-                                  m_Specific(&Plan.getVectorTripCount()))))
+  if (!match(Cond, m_Binary<Instruction::ICmp>(
+                       m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
+                       m_Specific(&Plan.getVectorTripCount()))) ||
+      cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
+          CmpInst::ICMP_EQ)
     return false;

   // The compare checks CanIV + VFxUF == vector trip count. The vector trip
@@ -1518,8 +1519,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
         if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
           return R->isCanonical();
-        return isa<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
-                   VPPhi>(&Phi);
+        return isa<VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
       })) {
     for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
       if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
@@ -1534,6 +1535,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
       HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
       HeaderR.eraseFromParent();
     }
+    Plan.getCanonicalIV()->replaceAllUsesWith(
+        Plan.getCanonicalIV()->getStartValue());

     VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
     VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
@@ -2011,15 +2014,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
     VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
-  auto *CanonicalIVPHI = Plan.getCanonicalIV();
-  VPValue *StartV = CanonicalIVPHI->getStartValue();
+  auto *CanonicalIV = Plan.getCanonicalIV();
+  VPValue *StartV = CanonicalIV->getStartValue();

   auto *CanonicalIVIncrement =
-      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+      cast<VPInstruction>(EB->getTerminator()->getOperand(0));
   // TODO: Check if dropping the flags is needed if
   // !DataAndControlFlowWithoutRuntimeCheck.
   CanonicalIVIncrement->dropPoisonGeneratingFlags();
-  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+  CanonicalIV->dropNUW();
+  DebugLoc DL = CanonicalIV->getDebugLoc();
   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   //   Part * VF
@@ -2040,7 +2044,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
     // When avoiding a runtime check, the active.lane.mask inside the loop
     // uses a modified trip count and the induction variable increment is
     // done after the active.lane.mask intrinsic is called.
-    IncrementValue = CanonicalIVPHI;
+    IncrementValue = CanonicalIV;
     TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                      {TC}, DL);
   }
@@ -2056,7 +2060,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
   auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
-  LaneMaskPhi->insertAfter(CanonicalIVPHI);
+  auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
+  LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());

   // Create the active lane mask for the next iteration of the loop before the
   // original terminator.
@@ -2361,7 +2366,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {

 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
 /// replaces all uses except the canonical IV increment of
-/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
+/// VPCanonicalIV with a VPEVLBasedIVPHIRecipe. VPCanonicalIV
 /// is used only for loop iterations counting after this transformation.
 ///
 /// The function uses the following definitions:
@@ -2406,13 +2411,17 @@ void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();

-  auto *CanonicalIVPHI = Plan.getCanonicalIV();
-  auto *CanIVTy = CanonicalIVPHI->getScalarType();
-  VPValue *StartV = CanonicalIVPHI->getStartValue();
+  auto *CanonicalIV = Plan.getCanonicalIV();
+  auto *CanIVTy = CanonicalIV->getScalarType();
+  VPValue *StartV = CanonicalIV->getStartValue();
+  auto *CanonicalIVIncrement = cast<VPInstruction>(Plan.getVectorLoopRegion()
+                                                       ->getExitingBasicBlock()
+                                                       ->getTerminator()
+                                                       ->getOperand(0));

   // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
-  EVLPhi->insertAfter(CanonicalIVPHI);
+  EVLPhi->insertBefore(*Header, Header->begin());
   VPBuilder Builder(Header, Header->getFirstNonPhi());
   // Create the AVL (application vector length), starting from TC -> 0 in steps
   // of EVL.
@@ -2430,8 +2439,6 @@ void VPlanTransforms::addExplicitVectorLength(
   auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
                                      DebugLoc());

-  auto *CanonicalIVIncrement =
-      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   Builder.setInsertPoint(CanonicalIVIncrement);
   VPValue *OpVPEVL = VPEVL;
@@ -2441,8 +2448,7 @@ void VPlanTransforms::addExplicitVectorLength(

   auto *NextEVLIV = Builder.createOverflowingOp(
       Instruction::Add, {OpVPEVL, EVLPhi},
-      {CanonicalIVIncrement->hasNoUnsignedWrap(),
-       CanonicalIVIncrement->hasNoSignedWrap()},
+      {CanonicalIV->hasNoUnsignedWrap(), false},
       CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
   EVLPhi->addOperand(NextEVLIV);

@@ -2453,10 +2459,10 @@ void VPlanTransforms::addExplicitVectorLength(

   transformRecipestoEVLRecipes(Plan, *VPEVL);

-  // Replace all uses of VPCanonicalIVPHIRecipe by
+  // Replace all uses of VPCanonicalIV by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
-  CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
-  CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+  CanonicalIV->replaceAllUsesWith(EVLPhi);
+  CanonicalIVIncrement->setOperand(0, CanonicalIV);
   // TODO: support unroll factor > 1.
   Plan.setUF(1);
 }
@@ -2507,15 +2513,15 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {

   // Replace CanonicalIVInc with EVL-PHI increment.
   auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
   VPValue *Backedge = CanonicalIV->getIncomingValue(1);
-  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
-                                 m_Specific(&Plan.getVFxUF()))) &&
-         "Unexpected canonical iv");
-  Backedge->replaceAllUsesWith(EVLIncrement);
+  if (match(Backedge,
+            m_c_Add(m_Specific(CanonicalIV), m_Specific(&Plan.getVFxUF())))) {
+    Backedge->replaceAllUsesWith(EVLIncrement);

-  // Remove unused phi and increment.
-  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
-  CanonicalIVIncrement->eraseFromParent();
-  CanonicalIV->eraseFromParent();
+    // Remove unused phi and increment.
+    VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+    CanonicalIVIncrement->eraseFromParent();
+    CanonicalIV->eraseFromParent();
+  }

   // Replace the use of VectorTripCount in the latch-exiting block.
   // Before: (branch-on-count EVLIVInc, VectorTripCount)
@@ -3623,8 +3629,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   unsigned FixedVF = VF.getFixedValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
-        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+    if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
       continue;

     if (isa<VPInterleaveRecipe>(&R) &&
@@ -3774,7 +3779,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   // Adjust induction to reflect that the transformed plan only processes one
   // original iteration.
   auto *CanIV = Plan.getCanonicalIV();
-  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  auto *Inc = cast<VPInstruction>(
+      VectorLoop->getExitingBasicBlock()->getTerminator()->getOperand(0));
   Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
                          CanIV->getScalarType(), 1 * Plan.getUF())));
   Plan.getVF().replaceAllUsesWith(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 7a63d20825a31..88276924b7759 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -79,7 +79,7 @@ class UnrollState {
   void unrollBlock(VPBlockBase *VPB);

   VPValue *getValueForPart(VPValue *V, unsigned Part) {
-    if (Part == 0 || V->isLiveIn())
+    if (Part == 0 || V->isLiveIn() || isa<VPCanonicalIV>(V))
       return V;
     assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
            "accessed value does not exist");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 700a733bf9f2c..8c96181017541 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -105,9 +105,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
     return all_of(R->operands(), isUniformAcrossVFsAndUFs);
   }

-  auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
-  // Canonical IV chain is uniform.
-  if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+  if (isa<VPCanonicalIV>(V))
     return true;

   return TypeSwitch<const VPRecipeBase *, bool>(R)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..d7faccaa70423 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPUser;
 class VPRecipeBase;
 class VPInterleaveRecipe;
 class VPPhiAccessors;
+class VPCanonicalIV;

 // This is the base class of the VPlan Def/Use graph, used for modeling the data
 // flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue {
   friend class VPInterleaveRecipe;
   friend class VPlan;
   friend class VPExpressionRecipe;
+  friend class VPCanonicalIV;

   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -89,7 +91,9 @@ class LLVM_ABI_FOR_TEST VPValue {
   enum {
     VPValueSC,  /// A generic VPValue, like live-in values or defined by a recipe
                 /// that defines multiple values.
-    VPVRecipeSC /// A VPValue sub-class that is a VPRecipeBase.
+    VPVRecipeSC,     /// A VPValue sub-class that is a VPRecipeBase.
+    VPCanonicalIVSC, /// A VPValue sub-class that defines the canonical IV of a
+                     /// loop region.
   };

   VPValue(const VPValue &) = delete;
@@ -166,7 +170,9 @@ class LLVM_ABI_FOR_TEST VPValue {
   bool hasDefiningRecipe() const { return getDefiningRecipe(); }

   /// Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
-  bool isLiveIn() const { return !hasDefiningRecipe(); }
+  bool isLiveIn() const {
+    return !hasDefiningRecipe() && SubclassID != VPCanonicalIVSC;
+  }

   /// Returns the underlying IR value, if this VPValue is defined outside the
   /// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef
@@ -361,7 +367,6 @@ class VPDef {
     VPPredInstPHISC,
     // START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
     // VPHeaderPHIRecipe need to be kept together.
-    VPCanonicalIVPHISC,
     VPActiveLaneMaskPHISC,
     VPEVLBasedIVPHISC,
     VPFirstOrderRecurrencePHISC,
@@ -371,7 +376,7 @@ class VPDef {
     // END: SubclassID for recipes that inherit VPHeaderPHIRecipe
     // END: Phi-like recipes
     VPFirstPHISC = VPWidenPHISC,
-    VPFirstHeaderPHISC = VPCanonicalIVPHISC,
+    VPFirstHeaderPHISC = VPActiveLaneMaskPHISC,
     VPLastHeaderPHISC = VPReductionPHISC,
     VPLastPHISC = VPReductionPHISC,
   };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418e..dc0696a5ce844 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -458,12 +458,6 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
     return false;
   }

-  if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
-    errs() << "VPlan vector loop header does not start with a "
-              "VPCanonicalIVPHIRecipe\n";
-    return false;
-  }
-
   const VPBasicBlock *Exiting = dyn_cast<VPBasicBlock>(TopRegion->getExiting());
   if (!Exiting) {
     errs() << "VPlan exiting block is not a VPBasicBlock\n";
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index aa3d81a19a6d2..4710fa197b871 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -579,10 +579,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; PRED-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
 ; PRED-NEXT:    [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
 ; PRED-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[NEXT_GEP]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
 ; PRED-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; PRED-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP15]], true
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; PRED-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED: [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br [[EXIT:label %.*]]
@@ -658,15 +658,15 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; COMMON-NEXT:    store i8 6, ptr [[TMP6]], align 1
 ; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
 ; COMMON:       [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT:    br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; COMMON-NEXT:    br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
 ; COMMON:       [[PRED_STORE_IF13]]:
 ; COMMON-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
 ; COMMON-NEXT:    store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
-; COMMON:       [[PRED_STORE_CONTINUE14]]:
+; COMMON-NEXT:    br label %[[EXIT]]
+; COMMON:       [[EXIT]]:
 ; COMMON-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; COMMON:       [[MIDDLE_BLOCK]]:
-; COMMON-NEXT:    br [[EXIT:label %.*]]
+; COMMON-NEXT:    br [[EXIT1:label %.*]]
 ; COMMON:       [[SCALAR_PH]]:
 ;
 entry:
@@ -1052,11 +1052,11 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT:    store float 0.000000e+00, ptr [[TMP83]], align 4
 ; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE27]]
 ; PRED:       [[PRED_STORE_CONTINUE27]]:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX]], i64 [[TMP17]])
 ; PRED-NEXT:    [[TMP84:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; PRED-NEXT:    [[TMP85:%.*]] = xor i1 [[TMP84]], true
 ; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; PRED-NEXT:    br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br [[EXIT:label %.*]]
@@ -1347,10 +1347,10 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
 ; PRED-NEXT:    [[TMP26:%.*]] = fptoui <vscale x 16 x float> [[TMP25]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]]
 ; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
 ; PRED-NEXT:    [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; PRED-NEXT:    [[TMP29:%.*]] = xor i1 [[TMP28]], true
+; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; PRED-NEXT:    br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br [[EXIT:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index a44cc09b8a8ea..65b2c348607ca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -140,11 +140,11 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
 ; CHECK-NEXT:    [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP34]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP35]], true
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
@@ -258,11 +258,11 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
 ; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP38]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP40:%.*]] = xor i1 [[TMP39]], true
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index c3b0bc8c00a74..bc99d50b7b761 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -10,10 +10,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
 ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
-; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 0 for VF 8: WIDEN-REDUCTION-PHI ir<{{.+}}> = phi vp<{{.+}}>, ir<{{.+}}>
 ; CHECK: Cost for VF 8: 30
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
-; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: Cost of 0 for VF 16: WIDEN-REDUCTION-PHI ir<{{.+}}> = phi vp<{{.+}}>, ir<{{.+}}>
 ; CHECK: Cost for VF 16: 56
 ; CHECK: LV: Selecting VF: 16
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..fa11706a12b0b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -124,10 +124,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; PRED-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
 ; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
 ; PRED-NEXT:    [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; PRED-NEXT:    [[TMP27:%.*]] = xor
i1 [[TMP25]], true +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] ; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] @@ -291,11 +291,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) ; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true ; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] @@ -480,11 +480,11 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] ; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) ; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true ; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] @@ -671,11 +671,11 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) ; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true ; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index aed1c3d9fcc4c..53f477e828136 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -85,10 +85,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON-NEXT: store double [[TMP19]], ptr [[P]], align 8 ; TFCOMMON-NEXT: br label [[PRED_STORE_CONTINUE6]] ; TFCOMMON: pred.store.continue2: -; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]]) ; TFCOMMON-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> 
[[ACTIVE_LANE_MASK_NEXT]], i32 0 ; TFCOMMON-NEXT: [[TMP17:%.*]] = xor i1 [[TMP15]], true +; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFCOMMON-NEXT: br i1 [[TMP17]], label [[END:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; TFCOMMON: end: ; TFCOMMON-NEXT: ret void @@ -149,12 +149,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]] ; TFA_INTERLEAVE: pred.store.continue9: -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]]) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[TMP27]], i64 [[TMP3]]) ; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor i1 [[TMP26]], true +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP28]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFA_INTERLEAVE: end: ; TFA_INTERLEAVE-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 666057b18ccd0..939c2d5225397 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -992,11 +992,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label %[[TMP13]] ; TFA_INTERLEAVE: [[TMP13]]: -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]] ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TFA_INTERLEAVE: [[END]]: ; TFA_INTERLEAVE-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index 67e6902b5d32a..2aef013662ad8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -384,9 +384,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] ; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) ; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -523,11 +523,11 @@ define void 
@sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] ; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] ; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) -; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; DEFAULT-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; DEFAULT-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -590,11 +590,11 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] ; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] ; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) -; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; OPTSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true ; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; OPTSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -657,11 +657,11 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[TMP21:%.*]] = add [[TMP18]], [[TMP20]] ; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] ; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) -; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; MINSIZE-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; MINSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true ; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; MINSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 792249d7829b0..22215f4c89ebf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1401,10 +1401,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19]] 
= select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) @@ -1438,10 +1438,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) @@ -1475,10 +1475,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP18]]) -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 8b2da8c4a7047..6161de23ec432 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -233,10 +233,10 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; PRED-NEXT: [[TMP39:%.*]] = or [[WIDE_MASKED_GATHER]], [[VEC_PHI]] ; PRED-NEXT: [[TMP40:%.*]] = or [[TMP39]], [[WIDE_MASKED_GATHER7]] ; PRED-NEXT: [[TMP41]] = select [[ACTIVE_LANE_MASK]], [[TMP40]], [[VEC_PHI]] -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], [[TMP2]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 [[IV]], i64 [[TMP10]]) ; PRED-NEXT: [[TMP43:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP42:%.*]] = xor i1 [[TMP43]], true +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], [[TMP2]] ; PRED-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP41]]) @@ -456,10 +456,10 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED-NEXT: [[TMP20:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] ; PRED-NEXT: [[TMP21:%.*]] = or [[TMP20]], [[VEC_PHI]] ; PRED-NEXT: [[TMP16]] = select [[ACTIVE_LANE_MASK]], [[TMP21]], [[VEC_PHI]] -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]]) ; PRED-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP15]], true +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] ; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16( [[TMP16]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index e51a925040a49..23cc7e367776a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -74,7 +74,7 @@ define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 48 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 47 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: %cmp100 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index 375e412861777..6310ab4dca889 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -137,10 +137,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP8]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = xor i1 [[TMP10]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] @@ -374,7 +374,6 @@ define float 
@fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP26]], [[TMP27]]) ; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP30]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP28]], [[TMP29]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = shl nuw i64 [[TMP31]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], [[TMP32]] @@ -390,6 +389,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP39]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = xor i1 [[TMP40]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] @@ -626,10 +626,10 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP15]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] @@ -859,10 +859,10 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP11]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP7]]) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] @@ -1077,10 +1077,10 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr 
noalias no ; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_MASKED_LOAD1]], splat (float 3.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP11]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] @@ -1526,7 +1526,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], [[TMP41]]) ; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP36]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP44]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP42]], [[TMP43]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = add i64 [[INDEX]], [[TMP46]] @@ -1542,6 +1541,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] @@ -1836,7 +1836,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], [[TMP41]]) ; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP36]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP44]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP42]], [[TMP43]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = shl nuw i64 [[TMP45]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = add i64 [[INDEX]], [[TMP46]] @@ -1852,6 +1851,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index f6de370874d12..5d0cedf2263c3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -93,10 +93,10 @@ define void @cost_store_i8(ptr %dst) #0 { ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( zeroinitializer, ptr [[TMP13]], i32 1, [[ACTIVE_LANE_MASK]]) -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) ; PRED-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP12:%.*]] = xor i1 [[TMP14]], true +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; PRED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 32235860dd9e2..b5c8e26092dcc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -109,10 +109,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP15]], [[TMP18]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK3]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] @@ -239,10 +239,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP15]], i32 1, [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] 
= extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] @@ -373,10 +373,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg [[TMP15]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP16]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP17]], i32 1, [[TMP14]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] @@ -533,10 +533,10 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP9]], [[TMP9]], [[TMP9]], [[TMP9]]) ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK3]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index ad9eefd8ee6a1..54c255f961eee 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -62,10 +62,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: 
[[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index a22f065415307..5d696d1d96d06 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -84,9 +84,9 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 42) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 821b9fbbda78f..bb7a78f087110 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -29,10 +29,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP14]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) @@ -75,10 +75,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[VEC_PHI]], [[TMP14]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = 
call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] ; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -137,10 +137,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], splat (float -0.000000e+00) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -181,10 +181,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], splat (float -0.000000e+00) ; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -247,10 +247,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP16]], true +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP20]]) @@ -304,10 +304,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: 
[[TMP17:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) ; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[VEC_PHI]], [[TMP18]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] ; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br label [[FOR_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 90224caa68cd4..e1d2a4bbe8cde 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -53,7 +53,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) -; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] ; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP64:%.*]] = shl nuw i64 [[TMP63]], 2 ; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]] @@ -69,6 +68,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP35:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true +; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -160,7 +160,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP70]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, [[TMP71]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) -; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]] ; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP86:%.*]] = shl nuw i64 [[TMP85]], 2 ; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]] @@ -176,6 +175,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP66:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP67:%.*]] = xor i1 [[TMP66]], true +; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]] ; CHECK-NEXT: br i1 [[TMP67]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 552d9e23c33c5..e487a91cea5c3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -26,10 +26,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP13]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -69,10 +69,10 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = xor i1 [[TMP6]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -116,10 +116,10 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP12]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -175,11 +175,11 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: 
[[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -227,10 +227,10 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] @@ -280,10 +280,10 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = xor i1 [[TMP14]], true +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] @@ -338,10 +338,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_GATHER]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; 
CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -399,10 +399,10 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -450,10 +450,10 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
 ; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
@@ -505,10 +505,10 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1)
 ; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP14]], true
+; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index 3b19e9ee1a5a3..0ae1d7ad41aa2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -555,9 +555,9 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
 ; CHECK-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
 ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index 150b79c448005..410d167e75c61 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -27,10 +27,10 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]]
 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IDX]], 4
 ; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
 ; CHECK-NEXT: [[EXTRACT_FIRST_LANE_MASK:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0
 ; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = xor i1 [[EXTRACT_FIRST_LANE_MASK]], true
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IDX]], 4
 ; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[FOR_END:label %.*]]
@@ -81,10 +81,10 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl
 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK]], <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX6]]
 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX6]], 4
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX6]], i64 [[TMP3]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT: [[TMP9:%.*]] = xor i1 [[TMP8]], true
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX6]], 4
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[FOR_END:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 24d23d81b1f92..9a2d3a4bdc44b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -184,10 +184,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features"
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP15]], true
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index e9de5e21228fd..9c886a4128c62 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -25,9 +25,9 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
@@ -63,13 +63,13 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
 ; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
 ; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
@@ -111,9 +111,9 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
@@ -149,13 +149,13 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]]
 ; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1
 ; INTERLEAVE-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP17]], i64 [[TMP4]])
 ; INTERLEAVE-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
 ; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
@@ -227,10 +227,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
 ; INTERLEAVE-NEXT: store double [[TMP8]], ptr [[TMP9]], align 8
 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; INTERLEAVE: pred.store.continue4:
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; INTERLEAVE-NEXT: [[TMP10:%.*]] = or disjoint i64 [[INDEX]], 1
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP10]], [[TMP0]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index bdf832f32964f..f92cbc029ffe4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -22,7 +22,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 2: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
 ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -39,7 +38,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 4: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
 ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -56,7 +54,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n
-; CHECK: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1>
 ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}>
 ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer ir<%arrayidx>
@@ -135,8 +132,7 @@ for.inc: ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
 ; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
 ; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
 ; CHECK: Cost of 0 for VF 2: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -167,8 +163,7 @@ for.inc: ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
 ; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
 ; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
 ; CHECK: Cost of 0 for VF 4: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -199,8 +194,7 @@ for.inc: ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
 ; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
 ; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
 ; CHECK: Cost of 0 for VF 8: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
@@ -231,8 +225,7 @@ for.inc: ; preds = %for.body, %if.then
 ; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1
 ; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
 ; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0
-; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV:%.+]]>, ir<1>
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]>
 ; CHECK: Cost of 0 for VF 16: vp<[[STEPS2:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep>.1 = ptradd ir<%pDst>, vp<[[STEPS2]]>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index dcf4bee728b29..e379420c5cdfa 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -380,9 +380,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
 ; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
 ; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
index 0b23206134bc0..43cce8005bbf6 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -10,28 +10,43 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
 ; CHECK-SAME: ptr noalias [[R:%.*]], ptr noalias [[A:%.*]], i32 [[N:%.*]]) {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT: br i1 [[CMP24]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_END13:.*]]
-; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT: br i1 [[CMP24]], label %[[ITER_CHECK:.*]], label %[[FOR_END13:.*]]
+; CHECK: [[ITER_CHECK]]:
 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP69:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP131:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP132:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP133:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP134:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP135:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP1]], i64 0, i32 0
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP4]], i64 0, i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP5]], i64 0, i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP6]], i64 0, i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP7]], i64 0, i32 0
 ; CHECK-NEXT: [[WIDE_VEC35:%.*]] = load <12 x float>, ptr [[TMP13]], align 8
 ; CHECK-NEXT: [[STRIDED_VEC36:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC37:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32>
@@ -60,116 +75,252 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
 ; CHECK-NEXT: [[STRIDED_VEC60:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC61:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC62:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <12 x float>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC49:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC56:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC63:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC64:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC65:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[WIDE_VEC43:%.*]] = load <12 x float>, ptr [[TMP17]], align 8
+; CHECK-NEXT: [[STRIDED_VEC66:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC67:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC68:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC69:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC70:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC71:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[WIDE_VEC50:%.*]] = load <12 x float>, ptr [[TMP18]], align 8
+; CHECK-NEXT: [[STRIDED_VEC72:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC73:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC80:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC81:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC82:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC83:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[WIDE_VEC57:%.*]] = load <12 x float>, ptr [[TMP19]], align 8
+; CHECK-NEXT: [[STRIDED_VEC84:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC85:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC86:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC87:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC88:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC89:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]]
 ; CHECK-NEXT: [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]]
 ; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]]
 ; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]]
+; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC42]], [[STRIDED_VEC42]]
+; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC66]], [[STRIDED_VEC66]]
+; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC72]], [[STRIDED_VEC72]]
+; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC84]], [[STRIDED_VEC84]]
 ; CHECK-NEXT: [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], [[STRIDED_VEC37]]
 ; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]]
 ; CHECK-NEXT: [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]]
 ; CHECK-NEXT: [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]]
+; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC49]], [[STRIDED_VEC49]]
+; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC67]], [[STRIDED_VEC67]]
+; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC73]], [[STRIDED_VEC73]]
+; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC85]], [[STRIDED_VEC85]]
 ; CHECK-NEXT: [[TMP80:%.*]] = fadd fast <2 x float> [[TMP72]], [[TMP64]]
 ; CHECK-NEXT: [[TMP113:%.*]] = fadd fast <2 x float> [[TMP105]], [[TMP97]]
 ; CHECK-NEXT: [[TMP114:%.*]] = fadd fast <2 x float> [[TMP106]], [[TMP98]]
 ; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <2 x float> [[TMP107]], [[TMP99]]
-; CHECK-NEXT: [[TMP21:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
-; CHECK-NEXT: [[TMP22:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
-; CHECK-NEXT: [[TMP23:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
-; CHECK-NEXT: [[TMP24:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
-; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x double> [[TMP21]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x double> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x double> [[TMP23]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP24]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
-; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
-; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
-; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
-; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
-; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
-; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
-; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
 ; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <2 x float> [[TMP108]], [[TMP100]]
 ; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <2 x float> [[TMP109]], [[TMP101]]
 ; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <2 x float> [[TMP110]], [[TMP102]]
 ; CHECK-NEXT: [[TMP119:%.*]] = fadd fast <2 x float> [[TMP111]], [[TMP103]]
+; CHECK-NEXT: [[TMP40:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
+; CHECK-NEXT: [[TMP52:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
+; CHECK-NEXT: [[TMP53:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
+; CHECK-NEXT: [[TMP54:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
 ; CHECK-NEXT: [[TMP41:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double>
 ; CHECK-NEXT: [[TMP42:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double>
 ; CHECK-NEXT: [[TMP43:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double>
 ; CHECK-NEXT: [[TMP44:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double>
+; CHECK-NEXT: [[TMP55:%.*]] = fadd fast <2 x double> [[TMP40]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP49:%.*]] = fadd fast <2 x double> [[TMP52]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP50:%.*]] = fadd fast <2 x double> [[TMP53]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP51:%.*]] = fadd fast <2 x double> [[TMP54]], [[VEC_PHI4]]
 ; CHECK-NEXT: [[TMP45:%.*]] = fadd fast <2 x double> [[TMP41]], [[TMP25]]
 ; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP42]], [[TMP26]]
 ; CHECK-NEXT: [[TMP47:%.*]] = fadd fast <2 x double> [[TMP43]], [[TMP27]]
 ; CHECK-NEXT: [[TMP48:%.*]] = fadd fast <2 x double> [[TMP44]], [[TMP28]]
-; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
-; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
-; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
-; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
-; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
-; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
-; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
-; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
+; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
+; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
+; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
+; CHECK-NEXT: [[TMP60:%.*]] = fmul fast <2 x float> [[STRIDED_VEC56]], [[STRIDED_VEC56]]
+; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <2 x float> [[STRIDED_VEC68]], [[STRIDED_VEC68]]
+; CHECK-NEXT: [[TMP73:%.*]] = fmul fast <2 x float> [[STRIDED_VEC80]], [[STRIDED_VEC80]]
+; CHECK-NEXT: [[TMP74:%.*]] = fmul fast <2 x float> [[STRIDED_VEC86]], [[STRIDED_VEC86]]
+; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
+; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
+; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
+; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
+; CHECK-NEXT: [[TMP75:%.*]] = fmul fast <2 x float> [[STRIDED_VEC63]], [[STRIDED_VEC63]]
+; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <2 x float> [[STRIDED_VEC69]], [[STRIDED_VEC69]]
+; CHECK-NEXT: [[TMP70:%.*]] = fmul fast <2 x float> [[STRIDED_VEC81]], [[STRIDED_VEC81]]
+; CHECK-NEXT: [[TMP71:%.*]] = fmul fast <2 x float> [[STRIDED_VEC87]], [[STRIDED_VEC87]]
 ; CHECK-NEXT: [[TMP120:%.*]] = fadd fast <2 x float> [[TMP112]], [[TMP104]]
 ; CHECK-NEXT: [[TMP144:%.*]] = fadd fast <2 x float> [[TMP143]], [[TMP142]]
 ; CHECK-NEXT: [[TMP149:%.*]] = fadd fast <2 x float> [[TMP148]], [[TMP147]]
 ; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <2 x float> [[TMP153]], [[TMP152]]
+; CHECK-NEXT: [[TMP76:%.*]] = fadd fast <2 x float> [[TMP75]], [[TMP60]]
+; CHECK-NEXT: [[TMP77:%.*]] = fadd fast <2 x float> [[TMP81]], [[TMP67]]
+; CHECK-NEXT: [[TMP78:%.*]] = fadd fast <2 x float> [[TMP70]], [[TMP73]]
+; CHECK-NEXT: [[TMP79:%.*]] = fadd fast <2 x float> [[TMP71]], [[TMP74]]
 ; CHECK-NEXT: [[TMP61:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double>
 ; CHECK-NEXT: [[TMP62:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double>
 ; CHECK-NEXT: [[TMP63:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double>
 ; CHECK-NEXT: [[TMP155:%.*]] = fpext <2 x float> [[TMP154]] to <2 x double>
-; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP61]], [[TMP45]]
-; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP62]], [[TMP46]]
-; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP63]], [[TMP47]]
-; CHECK-NEXT: [[TMP67]] = fadd fast <2 x double> [[TMP155]], [[TMP48]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP84:%.*]] = fpext <2 x float> [[TMP76]] to <2 x double>
+; CHECK-NEXT: [[TMP85:%.*]] = fpext <2 x float> [[TMP77]] to <2 x double>
+; CHECK-NEXT: [[TMP86:%.*]] = fpext <2 x float> [[TMP78]] to <2 x double>
+; CHECK-NEXT: [[TMP87:%.*]] = fpext <2 x float> [[TMP79]] to <2 x double>
+; CHECK-NEXT: [[TMP88:%.*]] = fadd fast <2 x double> [[TMP61]], [[TMP55]]
+; CHECK-NEXT: [[TMP89:%.*]] = fadd fast <2 x double> [[TMP62]], [[TMP49]]
+; CHECK-NEXT: [[TMP90:%.*]] = fadd fast <2 x double> [[TMP63]], [[TMP50]]
+; CHECK-NEXT: [[TMP91:%.*]] = fadd fast <2 x double> [[TMP155]], [[TMP51]]
+; CHECK-NEXT: [[TMP92:%.*]] = fadd fast <2 x double> [[TMP84]], [[TMP45]]
+; CHECK-NEXT: [[TMP93:%.*]] = fadd fast <2 x double> [[TMP85]], [[TMP46]]
+; CHECK-NEXT: [[TMP94:%.*]] = fadd fast <2 x double> [[TMP86]], [[TMP47]]
+; CHECK-NEXT: [[TMP95:%.*]] = fadd fast <2 x double> [[TMP87]], [[TMP48]]
+; CHECK-NEXT: [[TMP96:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
+; CHECK-NEXT: [[TMP128:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
+; CHECK-NEXT: [[TMP129:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
+; CHECK-NEXT: [[TMP130:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
+; CHECK-NEXT: [[TMP136:%.*]] = fmul fast <2 x float> [[STRIDED_VEC64]], [[STRIDED_VEC64]]
+; CHECK-NEXT: [[TMP137:%.*]] = fmul fast <2 x float> [[STRIDED_VEC70]], [[STRIDED_VEC70]]
+; CHECK-NEXT: [[TMP139:%.*]] = fmul fast <2 x float> [[STRIDED_VEC82]], [[STRIDED_VEC82]]
+; CHECK-NEXT: [[TMP157:%.*]] = fmul fast <2 x float> [[STRIDED_VEC88]], [[STRIDED_VEC88]]
+; CHECK-NEXT: [[TMP159:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
+; CHECK-NEXT: [[TMP160:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
+; CHECK-NEXT: [[TMP161:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
+; CHECK-NEXT: [[TMP162:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT: [[TMP163:%.*]] = fmul fast <2 x float> [[STRIDED_VEC65]], [[STRIDED_VEC65]]
+; CHECK-NEXT: [[TMP164:%.*]] = fmul fast <2 x float> [[STRIDED_VEC71]], [[STRIDED_VEC71]]
+; CHECK-NEXT: [[TMP165:%.*]] = fmul fast <2 x float> [[STRIDED_VEC83]], [[STRIDED_VEC83]]
+; CHECK-NEXT: [[TMP166:%.*]] = fmul fast <2 x float> [[STRIDED_VEC89]], [[STRIDED_VEC89]]
+; CHECK-NEXT: [[TMP167:%.*]] = fadd fast <2 x float> [[TMP159]], [[TMP96]]
+; CHECK-NEXT: [[TMP168:%.*]] = fadd fast <2 x float> [[TMP160]], [[TMP128]]
+; CHECK-NEXT: [[TMP169:%.*]] = fadd fast <2 x float> [[TMP161]], [[TMP129]]
+; CHECK-NEXT: [[TMP170:%.*]] = fadd fast <2 x float> [[TMP162]], [[TMP130]]
+; CHECK-NEXT: [[TMP171:%.*]] = fadd fast <2 x float> [[TMP163]], [[TMP136]]
+; CHECK-NEXT: [[TMP172:%.*]] = fadd fast <2 x float> [[TMP164]], [[TMP137]]
+; CHECK-NEXT: [[TMP173:%.*]] = fadd fast <2 x float> [[TMP165]], [[TMP139]]
+; CHECK-NEXT: [[TMP174:%.*]] = fadd fast <2 x float> [[TMP166]], [[TMP157]]
+; CHECK-NEXT: [[TMP175:%.*]] = fpext <2 x float> [[TMP167]] to <2 x double>
+; CHECK-NEXT: [[TMP121:%.*]] = fpext <2 x float> [[TMP168]] to <2 x double>
+; CHECK-NEXT: [[TMP122:%.*]] = fpext <2 x float> [[TMP169]] to <2 x double>
+; CHECK-NEXT: [[TMP123:%.*]] = fpext <2 x float> [[TMP170]] to <2 x double>
+; CHECK-NEXT: [[TMP124:%.*]] = fpext <2 x float> [[TMP171]] to <2 x double>
+; CHECK-NEXT: [[TMP125:%.*]] = fpext <2 x float> [[TMP172]] to <2 x double>
+; CHECK-NEXT: [[TMP126:%.*]] = fpext <2 x float> [[TMP173]] to <2 x double>
+; CHECK-NEXT: [[TMP127:%.*]] = fpext <2 x float> [[TMP174]] to <2 x double>
+; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP175]], [[TMP88]]
+; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP121]], [[TMP89]]
+; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP122]], [[TMP90]]
+; CHECK-NEXT: [[TMP131]] = fadd fast <2 x double> [[TMP123]], [[TMP91]]
+; CHECK-NEXT: [[TMP132]] = fadd fast <2 x double> [[TMP124]], [[TMP92]]
+; CHECK-NEXT: [[TMP133]] = fadd fast <2 x double> [[TMP125]], [[TMP93]]
+; CHECK-NEXT: [[TMP134]] = fadd fast <2 x double> [[TMP126]], [[TMP94]]
+; CHECK-NEXT: [[TMP135]] = fadd fast <2 x double> [[TMP127]], [[TMP95]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP65]], [[TMP69]]
 ; CHECK-NEXT: [[BIN_RDX30:%.*]] = fadd fast <2 x double> [[TMP66]], [[BIN_RDX]]
-; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP67]], [[BIN_RDX30]]
+; CHECK-NEXT: [[BIN_RDX64:%.*]] = fadd fast <2 x double> [[TMP131]], [[BIN_RDX30]]
+; CHECK-NEXT: [[BIN_RDX65:%.*]] = fadd fast <2 x double> [[TMP132]], [[BIN_RDX64]]
+; CHECK-NEXT: [[BIN_RDX66:%.*]] = fadd fast <2 x double> [[TMP133]], [[BIN_RDX65]]
+; CHECK-NEXT: [[BIN_RDX67:%.*]] = fadd fast <2 x double> [[TMP134]], [[BIN_RDX66]]
+; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP135]], [[BIN_RDX67]]
 ; CHECK-NEXT: [[TMP158:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP156]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF69:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT: [[N_VEC70:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF69]]
+; CHECK-NEXT: [[TMP138:%.*]] = insertelement <2 x double> zeroinitializer, double [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT80:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI72:%.*]] = phi <2 x double> [ [[TMP138]], %[[VEC_EPILOG_PH]] ], [ [[TMP176:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
+; CHECK-NEXT: [[WIDE_VEC73:%.*]] = load <12 x float>, ptr [[ARRAYIDX5_REALP]], align 8
+; CHECK-NEXT: [[STRIDED_VEC74:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC75:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC76:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC77:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC78:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[STRIDED_VEC79:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32>
+; CHECK-NEXT: [[TMP140:%.*]] = fmul fast <2 x float> [[STRIDED_VEC74]], [[STRIDED_VEC74]]
+; CHECK-NEXT: [[TMP141:%.*]] = fmul fast <2 x float> [[STRIDED_VEC75]], [[STRIDED_VEC75]]
+; CHECK-NEXT: [[TMP177:%.*]] = fadd fast <2 x float> [[TMP141]], [[TMP140]]
+; CHECK-NEXT: [[TMP178:%.*]] = fpext <2 x float> [[TMP177]] to <2 x double>
+; CHECK-NEXT: [[TMP179:%.*]] = fadd fast <2 x double> [[TMP178]], [[VEC_PHI72]]
+; CHECK-NEXT: [[TMP145:%.*]] = fmul fast <2 x float> [[STRIDED_VEC76]], [[STRIDED_VEC76]]
+; CHECK-NEXT: [[TMP146:%.*]] = fmul fast <2 x float> [[STRIDED_VEC77]], [[STRIDED_VEC77]]
+; CHECK-NEXT: [[TMP180:%.*]] = fadd fast <2 x float> [[TMP146]], [[TMP145]]
+; CHECK-NEXT: [[TMP181:%.*]] = fpext <2 x float> [[TMP180]] to <2 x double>
+; CHECK-NEXT: [[TMP182:%.*]] = fadd fast <2 x double> [[TMP181]], [[TMP179]]
+; CHECK-NEXT: [[TMP150:%.*]] = fmul fast <2 x float> [[STRIDED_VEC78]], [[STRIDED_VEC78]]
+; CHECK-NEXT: [[TMP151:%.*]] = fmul fast <2 x float> [[STRIDED_VEC79]], [[STRIDED_VEC79]]
+; CHECK-NEXT: [[TMP183:%.*]] = fadd fast <2 x float> [[TMP151]], [[TMP150]]
+; CHECK-NEXT: [[TMP184:%.*]] = fpext <2 x float> [[TMP183]] to <2 x double>
+; CHECK-NEXT: [[TMP176]] = fadd fast <2 x double> [[TMP184]], [[TMP182]]
+; CHECK-NEXT: [[INDEX_NEXT80]] = add nuw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[TMP185:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]]
+; CHECK-NEXT: br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP186:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP176]])
+; CHECK-NEXT: [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]]
+; CHECK-NEXT: br i1 [[CMP_N81]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC70]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX82:%.*]] = phi double [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ]
 ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
 ; CHECK: [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
-; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 1
+; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX82]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX5_REALP1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP1]], align 8
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 1
 ; CHECK-NEXT: [[ARRAYIDX5_IMAG:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP]], align 8
 ; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[ARRAYIDX5_REAL]], [[ARRAYIDX5_REAL]]
 ; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[ARRAYIDX5_IMAG]], [[ARRAYIDX5_IMAG]]
 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL9]], [[MUL]]
 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[ADD]] to double
 ; CHECK-NEXT: [[ADD10:%.*]] = fadd fast double [[CONV]], [[SUM_026]]
-; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 0
 ; CHECK-NEXT: [[ARRAYIDX5_REAL_1:%.*]] = load float, ptr [[ARRAYIDX5_REALP_1]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 1
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 1
 ; CHECK-NEXT: [[ARRAYIDX5_IMAG_1:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_1]], align 8
 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[ARRAYIDX5_REAL_1]], [[ARRAYIDX5_REAL_1]]
 ; CHECK-NEXT: [[MUL9_1:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_1]], [[ARRAYIDX5_IMAG_1]]
 ; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL9_1]], [[MUL_1]]
 ; CHECK-NEXT: [[CONV_1:%.*]] = fpext float [[ADD_1]] to double
 ; CHECK-NEXT: [[ADD10_1:%.*]] = fadd fast double [[CONV_1]], [[ADD10]]
-; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 0
+; CHECK-NEXT: [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 0
 ; CHECK-NEXT: [[ARRAYIDX5_REAL_2:%.*]] = load float, ptr [[ARRAYIDX5_REALP_2]], align 8
-; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 1
+; CHECK-NEXT: [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 1
 ; CHECK-NEXT: [[ARRAYIDX5_IMAG_2:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_2]], align 8
 ; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[ARRAYIDX5_REAL_2]], [[ARRAYIDX5_REAL_2]]
 ; CHECK-NEXT: [[MUL9_2:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_2]], [[ARRAYIDX5_IMAG_2]]
 ; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL9_2]], [[MUL_2]]
 ; CHECK-NEXT: [[CONV_2:%.*]] = fpext float [[ADD_2]] to double
 ; CHECK-NEXT: [[ADD10_2]] = fadd fast double [[CONV_2]], [[ADD10_1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[FOR_COND_FOR_END13_CRIT_EDGE]]:
-; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float
 ; CHECK-NEXT: br label %[[FOR_END13]]
 ; CHECK: [[FOR_END13]]:
@@ -234,5 +385,6 @@ for.end13: ; preds = %for.cond.for.end13_
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index bb85b88f181f7..d72e33a0355a2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -10,7 +10,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 -; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> @@ -28,7 +27,6 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 -; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index d2f8f2203b724..5d16ce5346bbf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -169,64 +169,140 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: entry: ; AVX1-NEXT: [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AVX1: for.body.preheader: +; AVX1: iter.check: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX1: vector.main.loop.iter.check: +; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: -; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 12 ; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDEX]], 1 ; AVX1-NEXT: 
[[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP3]], 1
+; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP8]], 1
 ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP7]]
 ; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
 ; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
 ; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
 ; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
+; AVX1-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[WIDE_VEC9:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; AVX1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
 ; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
+; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC12]] to <4 x i32>
 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP7]]
 ; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
 ; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2
 ; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
 ; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[WIDE_VEC18:%.*]] = load <8 x i16>, ptr [[TMP17]], align 2
+; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[WIDE_VEC21:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
+; AVX1-NEXT: [[STRIDED_VEC24:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32>
 ; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
 ; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
+; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC24]] to <4 x i32>
 ; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
 ; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
-; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
 ; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
 ; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
-; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]]
-; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]]
+; AVX1-NEXT: [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC11]] to <4 x i32>
+; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT: [[TMP32:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT: [[TMP33:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT: [[TMP34:%.*]] = sext <4 x i16> [[STRIDED_VEC23]] to <4 x i32>
+; AVX1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
+; AVX1-NEXT: [[TMP60:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP28]]
+; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP33]], [[TMP29]]
+; AVX1-NEXT: [[TMP68:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP30]]
+; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP44]]
+; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP60]], [[TMP45]]
+; AVX1-NEXT: [[TMP69:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP46]]
+; AVX1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[TMP68]], [[TMP47]]
 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
 ; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4
+; AVX1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 8
+; AVX1-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 12
 ; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; AVX1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4
-; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; AVX1-NEXT: store <4 x i32> [[TMP69]], ptr [[TMP71]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP70]], ptr [[TMP72]], align 4
+; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; AVX1: vec.epilog.iter.check:
+; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; AVX1: vec.epilog.ph:
+; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]]
 ; AVX1-NEXT: br label [[FOR_BODY1:%.*]]
+; AVX1: vec.epilog.vector.body:
+; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT: [[TMP48:%.*]] = shl nuw nsw i64 [[INDEX26]], 1
+; AVX1-NEXT: [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP48]]
+; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
+; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32>
+; AVX1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP48]]
+; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP51]], align 2
+; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32>
+; AVX1-NEXT: [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32>
+; AVX1-NEXT: [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP52]], [[TMP50]]
+; AVX1-NEXT: [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32>
+; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32>
+; AVX1-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP54]]
+; AVX1-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP56]], [[TMP53]]
+; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDEX26]]
+; AVX1-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4
+; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4
+; AVX1-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]]
+; AVX1-NEXT: br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1: vec.epilog.middle.block:
+; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]]
+; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; AVX1: vec.epilog.scalar.ph:
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX1: for.body:
-; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
 ; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
@@ -248,7 +324,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX1: for.end.loopexit:
 ; AVX1-NEXT: br label [[FOR_END]]
 ; AVX1: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 4f33b5b0464a5..c25ea63e90589 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -113,8 +113,6 @@ define i64 @invar_cond(i1 %c) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -167,8 +165,6 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index 615f50124b41d..85f0e326b9663 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -449,8 +449,8 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
 ; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
 ; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
 ; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IC4VF4: [[MIDDLE_BLOCK]]:
 ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
@@ -814,8 +814,8 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
 ; IC4VF4-NEXT: [[TMP113:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> [[TMP109]], <4 x i16> [[VEC_PHI1]]
 ; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]]
 ; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]]
-; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4)
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IC4VF4: [[MIDDLE_BLOCK]]:
 ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]])
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index 70e730f0284c0..2f580956731ed 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -63,8 +63,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
 ; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 1
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK: [[PRED_STORE_CONTINUE4]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -139,8 +139,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -209,8 +209,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -285,8 +285,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[EXIT:label %.*]]
@@ -358,8 +358,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK: [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[EXIT:label %.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 87b11dc77d417..c176d69d4ae47 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -21,8 +21,9 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
@@ -31,7 +32,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -89,8 +90,9 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%y>, ir<%iv>
@@ -102,7 +104,7 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -163,8 +165,9 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: ir<%i> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%i>, ir<5>
@@ -191,7 +194,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -258,8 +261,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * ir<4>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<4>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%gep.AB.0> = getelementptr inbounds ir<@AB>, ir<0>, vp<[[STEPS]]>
@@ -274,7 +278,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
 ; CHECK-NEXT: store ir<1> to index 1
 ; CHECK-NEXT: store ir<2> to index 2
 ; CHECK-NEXT: store ir<%AB.3> to index 3
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -344,8 +348,9 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%asd>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd>
@@ -379,7 +384,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]> ir<%psd>/vp<[[SEL2]]>
 ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT:}
@@ -456,15 +461,16 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, vp<[[EXP_SCEV]]>, vp<[[VF]]> (truncated to i8)
 ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[EXP_SCEV]]>
 ; CHECK-NEXT: WIDEN ir<%v3> = add nuw ir<%iv>, ir<1>
 ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT: REPLICATE store ir<%v3>, ir<%gep>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -523,15 +529,16 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -589,8 +596,9 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%gep.y> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.y>
@@ -601,7 +609,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%div>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -660,8 +668,9 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
@@ -671,7 +680,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%div.1>, ir<%div.2>
 ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.x>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -729,8 +738,9 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%ld.addr>
@@ -761,7 +771,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -829,8 +839,9 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
@@ -840,7 +851,7 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%or.1>, ir<%or.2>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
 ; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -898,15 +909,16 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
 ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l>
 ; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
@@ -943,8 +955,9 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vp<[[CAN_IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.1> = phi ir<22>, ir<%for.1.next>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
@@ -1016,8 +1029,9 @@ define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, pt
 ; CHECK-NEXT: Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK: vector loop: {
+; CHECK-NEXT: vp<[[IV:%.+]]> = CANONICAL-IV ir<0>
+; CHECK-EMPTY:
 ; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT_EXIT:%.+]]>
 ; CHECK-NEXT: vp<[[ST:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%b>, vp<[[ST]]>
 ; CHECK-NEXT: vp<[[PTR1:%.+]]> = vector-pointer ir<[[GEP1]]>
@@ -1031,7 +1045,7 @@ define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, pt
 ; CHECK-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds nuw ir<%a>, vp<[[ST]]>
 ; CHECK-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
 ; CHECK-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[SELECT]]>
-; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index a943e7ac12b1b..91168d6de21fe 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -57,9 +57,7 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) {
   EXPECT_EQ(&*Plan, VecBB->getPlan());
   auto Iter = VecBB->begin();
-  auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(&*Iter++);
-  EXPECT_NE(nullptr, CanIV);
-  auto *Phi = dyn_cast<VPWidenPHIRecipe>(&*Iter++);
+  VPWidenPHIRecipe *Phi = dyn_cast<VPWidenPHIRecipe>(&*Iter++);
   EXPECT_NE(nullptr, Phi);
   VPInstruction *Idx = dyn_cast<VPInstruction>(&*Iter++);
@@ -218,7 +216,6 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
   EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB);
   auto Iter = VecBB->begin();
-  EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
index e38b4fad80b0e..103626087a33f 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp
@@ -23,32 +23,32 @@ using VPPatternMatchTest = VPlanTestBase;
 TEST_F(VPPatternMatchTest, ScalarIVSteps) {
   VPlan &Plan = getPlan();
-  VPBasicBlock *VPBB = Plan.createVPBasicBlock("");
-  VPBuilder Builder(VPBB);
-
   IntegerType *I64Ty = IntegerType::get(C, 64);
   VPValue *StartV = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
-  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DebugLoc());
-  Builder.insert(CanonicalIVPHI);
+  VPRegionBlock *VPR =
+      Plan.createVPRegionBlock(StartV, DebugLoc::getCompilerGenerated(), "");
+  VPCanonicalIV *CanIV = VPR->getCanonicalIV();
+  VPBasicBlock *VPBB = Plan.createVPBasicBlock("");
+  VPBuilder Builder(VPBB);
   VPValue *Inc = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
   VPValue *VF = &Plan.getVF();
-  VPValue *Steps = Builder.createScalarIVSteps(
-      Instruction::Add, nullptr, CanonicalIVPHI, Inc, VF, DebugLoc());
+  VPValue *Steps = Builder.createScalarIVSteps(Instruction::Add, nullptr, CanIV,
+                                               Inc, VF, DebugLoc());
   VPValue *Inc2 = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 2));
-  VPValue *Steps2 = Builder.createScalarIVSteps(
-      Instruction::Add, nullptr, CanonicalIVPHI, Inc2, VF, DebugLoc());
+  VPValue *Steps2 = Builder.createScalarIVSteps(Instruction::Add, nullptr,
+                                                CanIV, Inc2, VF, DebugLoc());
   using namespace VPlanPatternMatch;
-  ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
-                           m_SpecificInt(1), m_Specific(VF))));
+  ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(1),
+                                           m_Specific(VF))));
   ASSERT_FALSE(
-      match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
-                                    m_SpecificInt(1), m_Specific(VF))));
-  ASSERT_TRUE(match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI),
-                            m_SpecificInt(2), m_Specific(VF))));
+      match(Steps2, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(1),
+                                    m_Specific(VF))));
+  ASSERT_TRUE(match(Steps2, m_ScalarIVSteps(m_Specific(CanIV), m_SpecificInt(2),
+                                            m_Specific(VF))));
 }
 } // namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index c2f045bf524e9..b4de7b8780e3b 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -24,14 +24,12 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBB1->appendRecipe(UseI);
   VPBB1->appendRecipe(DefI);
   VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
-  VPBB2->appendRecipe(CanIV);
   VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
   VPBlockUtils::connectBlocks(VPBB1, R1);
   VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
@@ -59,15 +57,13 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
   VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
-      new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+      new VPInstruction(VPInstruction::BranchOnCond, {UseI});
   VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(UseI);
-  VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(DefI);
   VPBB2->appendRecipe(BranchOnCond);
@@ -100,9 +96,8 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 0));
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
-      new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+      new VPInstruction(VPInstruction::BranchOnCond, {DefI});
   auto *Blend = new VPBlendRecipe(Phi, {DefI}, {});
   VPBasicBlock *VPBB1 = Plan.getEntry();
@@ -110,7 +105,6 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
   VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
   VPBasicBlock *VPBB4 = Plan.createVPBasicBlock("");
-  VPBB2->appendRecipe(CanIV);
   VPBB3->appendRecipe(Blend);
   VPBB4->appendRecipe(DefI);
   VPBB4->appendRecipe(BranchOnCond);
@@ -157,8 +151,6 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) {
   VPPhi *Phi = new VPPhi({DefI}, {});
   VPBB2->appendRecipe(Phi);
   VPBB2->appendRecipe(DefI);
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
-  VPBB3->appendRecipe(CanIV);
   VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB3, VPBB3, "R1");
   VPBlockUtils::connectBlocks(VPBB1, VPBB2);
@@ -186,9 +178,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPlan &Plan = getPlan();
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
-      new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+      new VPInstruction(VPInstruction::BranchOnCond, {I1});
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
@@ -197,7 +188,6 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(BranchOnCond2);
-  VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(BranchOnCond);
   VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB2, VPBB2, "R1");
@@ -220,9 +210,8 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPlan &Plan = getPlan();
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
   VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
   VPInstruction *BranchOnCond =
-      new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+      new VPInstruction(VPInstruction::BranchOnCond, {I1});
   VPInstruction *BranchOnCond2 =
       new VPInstruction(VPInstruction::BranchOnCond, {I1});
@@ -231,7 +220,6 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
   VPBasicBlock *VPBB3 = Plan.createVPBasicBlock("");
   VPBB1->appendRecipe(I1);
-  VPBB2->appendRecipe(CanIV);
   VPBB2->appendRecipe(BranchOnCond2);
   VPBB3->appendRecipe(BranchOnCond);
@@ -260,8 +248,6 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
   VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
-  VPBB2->appendRecipe(CanIV);
   VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
   VPInstruction *BranchOnCond =
@@ -289,14 +275,11 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
 TEST_F(VPVerifierTest, NonHeaderPHIInHeader) {
   VPlan &Plan = getPlan();
   VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
-  auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
-  auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
+  auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {Zero});
   VPBasicBlock *VPBB1 = Plan.getEntry();
   VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("header");
-  VPBB2->appendRecipe(CanIV);
-
   PHINode *PHINode = PHINode::Create(Type::getInt32Ty(C), 2);
   auto *IRPhi = new VPIRPhi(*PHINode);
   VPBB2->appendRecipe(IRPhi);