diff --git a/llvm/include/llvm/IR/FMF.h b/llvm/include/llvm/IR/FMF.h
index d204d0a0e9d9a..da274a1d6ec43 100644
--- a/llvm/include/llvm/IR/FMF.h
+++ b/llvm/include/llvm/IR/FMF.h
@@ -23,13 +23,7 @@ class FastMathFlags {
   unsigned Flags = 0;
 
-  FastMathFlags(unsigned F) {
-    // If all 7 bits are set, turn this into -1. If the number of bits grows,
-    // this must be updated. This is intended to provide some forward binary
-    // compatibility insurance for the meaning of 'fast' in case bits are added.
-    if (F == 0x7F) Flags = ~0U;
-    else Flags = F;
-  }
+  FastMathFlags(unsigned F) : Flags(F) {}
 
 public:
   // This is how the bits are used in Value::SubclassOptionalData so they
@@ -43,9 +37,12 @@ class FastMathFlags {
     NoSignedZeros = (1 << 3),
     AllowReciprocal = (1 << 4),
     AllowContract = (1 << 5),
-    ApproxFunc = (1 << 6)
+    ApproxFunc = (1 << 6),
+    FlagEnd = (1 << 7)
   };
 
+  constexpr static unsigned AllFlagsMask = FlagEnd - 1;
+
   FastMathFlags() = default;
 
   static FastMathFlags getFast() {
@@ -56,10 +53,10 @@ class FastMathFlags {
 
   bool any() const { return Flags != 0; }
   bool none() const { return Flags == 0; }
-  bool all() const { return Flags == ~0U; }
+  bool all() const { return Flags == AllFlagsMask; }
 
   void clear() { Flags = 0; }
-  void set() { Flags = ~0U; }
+  void set() { Flags = AllFlagsMask; }
 
   /// Flag queries
   bool allowReassoc() const { return 0 != (Flags & AllowReassoc); }
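The FlagEnd sentinel makes the mask self-maintaining: adding a bit automatically widens AllFlagsMask, which is what the old 0x7F/~0U special case tried to approximate by hand. A minimal standalone sketch of the idiom, independent of LLVM:

    #include <cstdio>

    // Sketch of the FlagEnd/AllFlagsMask idiom from the FMF.h change above:
    // a one-past-the-last sentinel derives the mask, so all()/set() stay
    // correct when a new bit is appended.
    struct Flags {
      enum {
        BitA = (1 << 0),
        BitB = (1 << 1),
        FlagEnd = (1 << 2) // sentinel: one past the last real bit
      };
      constexpr static unsigned AllFlagsMask = FlagEnd - 1; // 0b11
      unsigned Value = 0;
      void set() { Value = AllFlagsMask; }
      bool all() const { return Value == AllFlagsMask; }
    };
    static_assert(Flags::AllFlagsMask == 0x3,
                  "mask covers exactly the defined bits");

    int main() {
      Flags F;
      F.set();
      std::printf("all set: %d\n", F.all()); // prints 1
    }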
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b4cd52fef70fd..e9eaec90380cc 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -409,8 +409,8 @@ Value *createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind);
 
 /// Overloaded function to generate vector-predication intrinsics for
 /// reduction.
-Value *createSimpleReduction(VectorBuilder &VB, Value *Src,
-                             const RecurrenceDescriptor &Desc);
+Value *createSimpleReduction(VectorBuilder &VB, Value *Src, RecurKind RdxKind,
+                             FastMathFlags FMFs);
 
 /// Create a reduction of the given vector \p Src for a reduction of the
 /// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is
@@ -422,23 +422,16 @@ Value *createAnyOfReduction(IRBuilderBase &B, Value *Src,
 /// Create a reduction of the given vector \p Src for a reduction of the
 /// kind RecurKind::IFindLastIV or RecurKind::FFindLastIV. The reduction
 /// operation is described by \p Desc.
-Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src,
+Value *createFindLastIVReduction(IRBuilderBase &B, Value *Src, Value *Start,
                                  const RecurrenceDescriptor &Desc);
 
-/// Create a generic reduction using a recurrence descriptor \p Desc
-/// Fast-math-flags are propagated using the RecurrenceDescriptor.
-Value *createReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
-                       Value *Src, PHINode *OrigPhi = nullptr);
-
 /// Create an ordered reduction intrinsic using the given recurrence
-/// descriptor \p Desc.
-Value *createOrderedReduction(IRBuilderBase &B,
-                              const RecurrenceDescriptor &Desc, Value *Src,
+/// kind \p RdxKind.
+Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src,
                               Value *Start);
 
 /// Overloaded function to generate vector-predication intrinsics for ordered
 /// reduction.
-Value *createOrderedReduction(VectorBuilder &VB,
-                              const RecurrenceDescriptor &Desc, Value *Src,
+Value *createOrderedReduction(VectorBuilder &VB, RecurKind RdxKind, Value *Src,
                               Value *Start);
 
 /// Get the intersection (logical and) of all of the potential IR flags
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 45915c10107b2..f2942ae60fed2 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1209,11 +1209,11 @@ Value *llvm::createAnyOfReduction(IRBuilderBase &Builder, Value *Src,
 }
 
 Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src,
+                                       Value *Start,
                                        const RecurrenceDescriptor &Desc) {
   assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind(
              Desc.getRecurrenceKind()) &&
          "Unexpected reduction kind");
-  Value *StartVal = Desc.getRecurrenceStartValue();
   Value *Sentinel = Desc.getSentinelValue();
   Value *MaxRdx = Src->getType()->isVectorTy()
                       ? Builder.CreateIntMaxReduce(Src, true)
@@ -1222,7 +1222,7 @@ Value *llvm::createFindLastIVReduction(IRBuilderBase &Builder, Value *Src,
   // reduction is sentinel value.
   Value *Cmp = Builder.CreateCmp(CmpInst::ICMP_NE, MaxRdx, Sentinel,
                                  "rdx.select.cmp");
-  return Builder.CreateSelect(Cmp, MaxRdx, StartVal, "rdx.select");
+  return Builder.CreateSelect(Cmp, MaxRdx, Start, "rdx.select");
 }
 
 Value *llvm::getReductionIdentity(Intrinsic::ID RdxID, Type *Ty,
@@ -1308,41 +1308,21 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
 }
 
 Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
-                                   const RecurrenceDescriptor &Desc) {
-  RecurKind Kind = Desc.getRecurrenceKind();
+                                   RecurKind Kind, FastMathFlags FMFs) {
   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
-         "AnyOf reduction is not supported.");
+         !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
+         "AnyOf or FindLastIV reductions are not supported.");
   Intrinsic::ID Id = getReductionIntrinsicID(Kind);
   auto *SrcTy = cast<VectorType>(Src->getType());
   Type *SrcEltTy = SrcTy->getElementType();
-  Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
+  Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, FMFs);
   Value *Ops[] = {Iden, Src};
   return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
 }
 
-Value *llvm::createReduction(IRBuilderBase &B,
-                             const RecurrenceDescriptor &Desc, Value *Src,
-                             PHINode *OrigPhi) {
-  // TODO: Support in-order reductions based on the recurrence descriptor.
-  // All ops in the reduction inherit fast-math-flags from the recurrence
-  // descriptor.
-  IRBuilderBase::FastMathFlagGuard FMFGuard(B);
-  B.setFastMathFlags(Desc.getFastMathFlags());
-
-  RecurKind RK = Desc.getRecurrenceKind();
-  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
-    return createAnyOfReduction(B, Src, Desc, OrigPhi);
-  if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
-    return createFindLastIVReduction(B, Src, Desc);
-
-  return createSimpleReduction(B, Src, RK);
-}
-
-Value *llvm::createOrderedReduction(IRBuilderBase &B,
-                                    const RecurrenceDescriptor &Desc,
+Value *llvm::createOrderedReduction(IRBuilderBase &B, RecurKind Kind,
                                     Value *Src, Value *Start) {
-  assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
-          Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
+  assert((Kind == RecurKind::FAdd || Kind == RecurKind::FMulAdd) &&
          "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
@@ -1350,11 +1330,9 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
   return B.CreateFAddReduce(Start, Src);
 }
 
-Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
-                                    const RecurrenceDescriptor &Desc,
+Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, RecurKind Kind,
                                     Value *Src, Value *Start) {
-  assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
-          Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
+  assert((Kind == RecurKind::FAdd || Kind == RecurKind::FMulAdd) &&
         "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
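For callers, the migration is mechanical: pass the RecurKind and FastMathFlags that were previously dug out of the descriptor by the helper itself. A hypothetical call site, where Desc, VBuilder and Src stand in for values the caller already has in scope:

    #include "llvm/Transforms/Utils/LoopUtils.h"
    using namespace llvm;

    Value *emitReduction(VectorBuilder &VBuilder, Value *Src,
                         const RecurrenceDescriptor &Desc) {
      // Before: the helper depended on the whole RecurrenceDescriptor.
      //   return createSimpleReduction(VBuilder, Src, Desc);
      // After: callers pass the recurrence kind and fast-math flags
      // explicitly, decoupling the utility from the descriptor.
      return createSimpleReduction(VBuilder, Src, Desc.getRecurrenceKind(),
                                   Desc.getFastMathFlags());
    }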
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index bc44ec11edb7b..82eeece7bb0b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -442,17 +442,15 @@ class LoopVectorizationPlanner {
   /// TODO: \p VectorizingEpilogue indicates if the executed VPlan is for the
   /// epilogue vector loop. It should be removed once the re-use issue has been
   /// fixed.
-  /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
-  /// to re-use expansion results generated during main plan execution.
   ///
   /// Returns a mapping of SCEVs to their expanded IR values.
   /// Note that this is a temporary workaround needed due to the current
   /// epilogue handling.
-  DenseMap<const SCEV *, Value *>
-  executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
-              InnerLoopVectorizer &LB, DominatorTree *DT,
-              bool VectorizingEpilogue,
-              const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
+  DenseMap<const SCEV *, Value *> executePlan(ElementCount VF, unsigned UF,
+                                              VPlan &BestPlan,
+                                              InnerLoopVectorizer &LB,
+                                              DominatorTree *DT,
+                                              bool VectorizingEpilogue);
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printPlans(raw_ostream &O);
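With the extra parameter gone, SCEV expansions flow only through executePlan's return value. A condensed sketch of the intended two-pass flow, using names from this patch with unrelated set-up elided:

    // First pass: the main-loop plan expands all SCEVs and returns the map.
    const SCEV2ValueTy ExpandedSCEVs = LVP.executePlan(
        VF, UF, BestMainPlan, MainILV, DT, /*VectorizingEpilogue=*/false);

    // The epilogue plan is rewritten to consume the already-expanded values
    // as live-ins before execution; no side-channel parameter is needed.
    preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
    LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                    DT, /*VectorizingEpilogue=*/true);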
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7cd395255163a..3378bf9023a2b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -494,11 +494,8 @@ class InnerLoopVectorizer {
   /// is generated around the vectorized (and scalar epilogue) loops consisting
   /// of various checks and bypasses. Return the pre-header block of the new
   /// loop. In the case of epilogue vectorization, this function is overriden to
-  /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
-  /// used to look up SCEV expansions for expressions needed during skeleton
-  /// creation.
-  virtual BasicBlock *
-  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
+  /// handle the more complex control flow around the loops.
+  virtual BasicBlock *createVectorizedLoopSkeleton();
 
   /// Fix the vectorized code, taking care of header phi's, and more.
   void fixVectorizedLoop(VPTransformState &State);
@@ -526,12 +523,6 @@ class InnerLoopVectorizer {
   /// count of the original loop for both main loop and epilogue vectorization.
   void setTripCount(Value *TC) { TripCount = TC; }
 
-  // Retrieve the additional bypass value associated with an original
-  /// induction header phi.
-  Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
-    return Induction2AdditionalBypassValue.at(OrigPhi);
-  }
-
   /// Return the additional bypass block which targets the scalar loop by
   /// skipping the epilogue loop after completing the main loop.
   BasicBlock *getAdditionalBypassBlock() const {
@@ -568,11 +559,6 @@ class InnerLoopVectorizer {
   /// vector loop preheader, middle block and scalar preheader.
   void createVectorLoopSkeleton(StringRef Prefix);
 
-  /// Create and record the values for induction variables to resume coming from
-  /// the additional bypass block.
-  void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
-                                             Value *MainVectorTripCount);
-
   /// Allow subclasses to override and print debug traces before/after vplan
   /// execution, when trace information is requested.
   virtual void printDebugTracesAtStart() {}
@@ -666,11 +652,6 @@ class InnerLoopVectorizer {
   /// for cleaning the checks, if vectorization turns out unprofitable.
   GeneratedRTChecks &RTChecks;
 
-  /// Mapping of induction phis to their additional bypass values. They
-  /// need to be added as operands to phi nodes in the scalar loop preheader
-  /// after the epilogue skeleton has been created.
-  DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
-
   /// The additional bypass block which conditionally skips over the epilogue
   /// loop after executing the main loop. Needed to resume inductions and
   /// reductions during epilogue vectorization.
@@ -733,16 +714,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
 
   // Override this function to handle the more complex control flow around the
   // three loops.
-  BasicBlock *
-  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
-    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
+  BasicBlock *createVectorizedLoopSkeleton() final {
+    return createEpilogueVectorizedLoopSkeleton();
   }
 
   /// The interface for creating a vectorized skeleton using one of two
   /// different strategies, each corresponding to one execution of the vplan
   /// as described above.
-  virtual BasicBlock *
-  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
+  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
 
   /// Holds and updates state information required to vectorize the main loop
   /// and its epilogue in two separate passes. This setup helps us avoid
@@ -770,8 +749,7 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
                                         EPI, LVL, CM, BFI, PSI, Check, Plan) {}
   /// Implements the interface for creating a vectorized skeleton using the
   /// *main loop* strategy (ie the first pass of vplan execution).
-  BasicBlock *
-  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
+  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
 
 protected:
   /// Emits an iteration count bypass check once for the main loop (when \p
@@ -801,8 +779,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
   }
   /// Implements the interface for creating a vectorized skeleton using the
   /// *epilogue loop* strategy (ie the second pass of vplan execution).
-  BasicBlock *
-  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
+  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
 
 protected:
   /// Emits an iteration count bypass check after the main vector loop has
@@ -2679,44 +2656,7 @@ static void addFullyUnrolledInstructionsToIgnore(
   }
 }
 
-void InnerLoopVectorizer::createInductionAdditionalBypassValues(
-    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
-  assert(MainVectorTripCount && "Must have bypass information");
-
-  Instruction *OldInduction = Legal->getPrimaryInduction();
-  IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
-                            getAdditionalBypassBlock()->getFirstInsertionPt());
-  for (const auto &InductionEntry : Legal->getInductionVars()) {
-    PHINode *OrigPhi = InductionEntry.first;
-    const InductionDescriptor &II = InductionEntry.second;
-    Value *Step = getExpandedStep(II, ExpandedSCEVs);
-    // For the primary induction the additional bypass end value is known.
-    // Otherwise it is computed.
-    Value *EndValueFromAdditionalBypass = MainVectorTripCount;
-    if (OrigPhi != OldInduction) {
-      auto *BinOp = II.getInductionBinOp();
-      // Fast-math-flags propagate from the original induction instruction.
-      if (isa_and_nonnull<FPMathOperator>(BinOp))
-        BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
-
-      // Compute the end value for the additional bypass.
-      EndValueFromAdditionalBypass =
-          emitTransformedIndex(BypassBuilder, MainVectorTripCount,
-                               II.getStartValue(), Step, II.getKind(), BinOp);
-      EndValueFromAdditionalBypass->setName("ind.end");
-    }
-
-    // Store the bypass value here, as it needs to be added as operand to its
-    // scalar preheader phi node after the epilogue skeleton has been created.
-    // TODO: Directly add as extra operand to the VPResumePHI recipe.
-    assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
-           "entry for OrigPhi already exits");
-    Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
-  }
-}
-
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
-    const SCEV2ValueTy &ExpandedSCEVs) {
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   /*
   In this function we generate a new loop. The new loop will contain
  the vectorized instructions while the old loop will continue to run the
@@ -7574,7 +7514,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
     BasicBlock *BypassBlock) {
   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
   if (!EpiRedResult ||
-      EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+      (EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
+       EpiRedResult->getOpcode() != VPInstruction::ComputeFindLastIVResult))
     return;
 
   auto *EpiRedHeaderPhi =
@@ -7595,14 +7536,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
                  RdxDesc.getRecurrenceKind())) {
     using namespace llvm::PatternMatch;
-    Value *Cmp, *OrigResumeV;
+    Value *Cmp, *OrigResumeV, *CmpOp;
     bool IsExpectedPattern =
         match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
                                         m_Specific(RdxDesc.getSentinelValue()),
                                         m_Value(OrigResumeV))) &&
-        match(Cmp,
-              m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
-                             m_Specific(RdxDesc.getRecurrenceStartValue())));
+        (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
+                                   m_Value(CmpOp))) &&
+         (match(CmpOp,
+                m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) ||
+          (CmpOp == RdxDesc.getRecurrenceStartValue() &&
+           isGuaranteedNotToBeUndefOrPoison(CmpOp))));
     assert(IsExpectedPattern && "Unexpected reduction resume pattern");
     (void)IsExpectedPattern;
     MainResumeValue = OrigResumeV;
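The widened matcher above accepts two shapes for the compare feeding the resume select: the start value wrapped in a freeze, or the raw start value when it is provably not undef or poison. A minimal standalone sketch of the same check, assuming only llvm::PatternMatch and ValueTracking:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Accept `icmp eq %x, freeze(%start)` as well as the unfrozen
    // `icmp eq %x, %start` when %start cannot be undef/poison. Cmp, X and
    // Start are values the caller has already identified.
    bool matchesResumeCompare(Value *Cmp, Value *X, Value *Start) {
      Value *CmpOp;
      return match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(X),
                                       m_Value(CmpOp))) &&
             (match(CmpOp, m_Freeze(m_Specific(Start))) ||
              (CmpOp == Start && isGuaranteedNotToBeUndefOrPoison(CmpOp)));
    }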
@@ -7628,17 +7572,11 @@
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
-    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
-    const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
+    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
   assert(BestVPlan.hasVF(BestVF) &&
          "Trying to execute plan with unsupported VF");
   assert(BestVPlan.hasUF(BestUF) &&
         "Trying to execute plan with unsupported UF");
-  assert(
-      ((VectorizingEpilogue && ExpandedSCEVs) ||
-       (!VectorizingEpilogue && !ExpandedSCEVs)) &&
-      "expanded SCEVs to reuse can only be used during epilogue vectorization");
-
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
@@ -7657,8 +7595,21 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 0. Generate SCEV-dependent code in the entry, including TripCount, before
   // making any changes to the CFG.
-  if (!BestVPlan.getEntry()->empty())
-    BestVPlan.getEntry()->execute(&State);
+  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
+  auto *Entry = cast<VPIRBasicBlock>(BestVPlan.getEntry());
+  State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
+  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
+    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
+    if (!ExpSCEV)
+      continue;
+    ExpSCEV->execute(State);
+    ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(ExpSCEV, VPLane(0));
+    VPValue *Exp = BestVPlan.getOrAddLiveIn(ExpandedSCEVs[ExpSCEV->getSCEV()]);
+    ExpSCEV->replaceAllUsesWith(Exp);
+    if (BestVPlan.getTripCount() == ExpSCEV)
+      BestVPlan.resetTripCount(Exp);
+    ExpSCEV->eraseFromParent();
+  }
 
   if (!ILV.getTripCount())
     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
@@ -7668,10 +7619,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  VPBasicBlock *VectorPH =
-      cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
-  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
-      ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
+  VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSingleSuccessor());
+  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
 
   if (VectorizingEpilogue)
     VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -7712,8 +7661,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   auto *MiddleVPBB = BestVPlan.getMiddleBlock();
-  // 2.5 When vectorizing the epilogue, fix reduction and induction resume
-  // values from the additional bypass block.
+  // 2.5 When vectorizing the epilogue, fix reduction resume values from the
+  // additional bypass block.
   if (VectorizingEpilogue) {
     assert(!ILV.Legal->hasUncountableEarlyExit() &&
            "Epilogue vectorisation not yet supported with early exits");
@@ -7722,12 +7671,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
       fixReductionScalarResumeWhenVectorizingEpilog(
          &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
     }
-    BasicBlock *PH = OrigLoop->getLoopPreheader();
-    for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
-      auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
-      Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
-      Inc->setIncomingValueForBlock(BypassBlock, V);
-    }
   }
 
   // 2.6. Maintain Loop Hints
@@ -7780,7 +7723,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     }
   }
 
-  return State.ExpandedSCEVs;
+  return ExpandedSCEVs;
 }
 
 //===--------------------------------------------------------------------===//
@@ -7789,8 +7732,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
 /// This function is partially responsible for generating the control flow
 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
-    const SCEV2ValueTy &ExpandedSCEVs) {
+BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
   createVectorLoopSkeleton("");
 
   // Generate the code to check the minimum iteration count of the vector
@@ -7900,8 +7842,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
 /// This function is partially responsible for generating the control flow
 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
 BasicBlock *
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
-    const SCEV2ValueTy &ExpandedSCEVs) {
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
   createVectorLoopSkeleton("vec.epilog.");
 
   // Now, compare the remaining count and if there aren't enough iterations to
@@ -7969,11 +7910,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
     Phi->removeIncomingValue(EPI.MemSafetyCheck);
   }
 
-  // Generate bypass values from the additional bypass block. Note that when the
-  // vectorized epilogue is skipped due to iteration count check, then the
-  // resume value for the induction variable comes from the trip count of the
-  // main vector loop, passed as the second argument.
-  createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
   return LoopVectorPreHeader;
 }
 
@@ -9666,8 +9602,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       if (CM.blockNeedsPredicationForAnyReason(BB))
        CondOp = RecipeBuilder.getBlockInMask(BB);
 
+      // Non-FP RdxDescs will have all fast math flags set, so clear them.
+      FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
+                               ? RdxDesc.getFastMathFlags()
+                               : FastMathFlags();
       auto *RedRecipe = new VPReductionRecipe(
-          RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
+          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
       // Append the recipe to the end of the VPBasicBlock because we need to
       // ensure that it comes after all of it's inputs, including CondOp.
@@ -9707,8 +9647,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
         return isa<VPInstruction>(&U) &&
-               cast<VPInstruction>(&U)->getOpcode() ==
-                   VPInstruction::ComputeReductionResult;
+               (cast<VPInstruction>(&U)->getOpcode() ==
+                    VPInstruction::ComputeReductionResult ||
+                cast<VPInstruction>(&U)->getOpcode() ==
+                    VPInstruction::ComputeFindLastIVResult);
       });
       if (CM.usePredicatedReductionSelect(
               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
@@ -9751,15 +9693,25 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
     // even for in-loop reductions, until the reduction resume value handling is
     // also modeled in VPlan.
-    auto *FinalReductionResult = new VPInstruction(
-        VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+    VPInstruction *FinalReductionResult;
+    VPBuilder::InsertPointGuard Guard(Builder);
+    Builder.setInsertPoint(MiddleVPBB, IP);
+    if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
+            RdxDesc.getRecurrenceKind())) {
+      VPValue *Start = PhiR->getStartValue();
+      FinalReductionResult =
+          Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
+                               {PhiR, Start, NewExitingVPV}, ExitDL);
+    } else {
+      FinalReductionResult = Builder.createNaryOp(
+          VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+    }
     // Update all users outside the vector region.
     OrigExitingVPV->replaceUsesWithIf(
-        FinalReductionResult, [](VPUser &User, unsigned) {
+        FinalReductionResult, [FinalReductionResult](VPUser &User, unsigned) {
          auto *Parent = cast<VPRecipeBase>(&User)->getParent();
-          return Parent && !Parent->getParent();
+          return FinalReductionResult != &User && !Parent->getParent();
        });
-    FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 
     // Adjust AnyOf reductions; replace the reduction phi for the selected value
     // with a boolean reduction phi node to check if the condition is true in
@@ -9781,7 +9733,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         if (CmpR->getOperand(I) == PhiR)
          CmpR->setOperand(I, PhiR->getStartValue());
       }
-      VPBuilder::InsertPointGuard Guard(Builder);
       Builder.setInsertPoint(Select);
 
       // If the true value of the select is the reduction phi, the new value is
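The new ComputeFindLastIVResult opcode carries the start value as an explicit operand, which gives the freeze inserted below a single place to hook in. The reason a freeze is needed at all: undef or poison may observe a different value at every use, so the main loop's resume compare and the epilogue's final select must agree on one fixed start. A minimal IRBuilder sketch of that argument, with illustrative names (B, Start, Resume, Sentinel are not from the patch):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *makeResumeValue(IRBuilder<> &B, Value *Start, Value *Resume,
                           Value *Sentinel) {
      // Freeze once; every user below observes the same fixed value,
      // never raw undef/poison.
      Value *FrozenStart = B.CreateFreeze(Start, "fr");
      Value *Cmp = B.CreateICmpEQ(Resume, FrozenStart);
      return B.CreateSelect(Cmp, Sentinel, Resume);
    }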
@@ -10177,6 +10128,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
   VPlanTransforms::removeDeadRecipes(MainPlan);
 
   using namespace VPlanPatternMatch;
+  // When vectorizing the epilogue, FindLastIV reductions can introduce
+  // multiple uses of undef/poison. If the reduction start value may be undef
+  // or poison, it needs to be frozen and the frozen start has to be used when
+  // computing the reduction result. We also need to use the frozen value in
+  // the resume phi generated by the main vector loop, as this is also used to
+  // compute the reduction result after the epilogue vector loop.
+  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
+                                             bool UpdateResumePhis) {
+    VPBuilder Builder(Plan.getEntry());
+    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
+        continue;
+      VPValue *OrigStart = VPI->getOperand(1);
+      if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
+        continue;
+      VPInstruction *Freeze =
+          Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
+      VPI->setOperand(1, Freeze);
+      if (UpdateResumePhis)
+        OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
+          return Freeze != &U && isa<VPInstruction>(&U) &&
+                 cast<VPInstruction>(&U)->getOpcode() ==
+                     VPInstruction::ResumePhi;
+        });
+    }
+  };
+  AddFreezeForFindLastIVReductions(MainPlan, true);
+  AddFreezeForFindLastIVReductions(EpiPlan, false);
+
   VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
   VPValue *VectorTC = &MainPlan.getVectorTripCount();
   // If there is a suitable resume value for the canonical induction in the
@@ -10204,24 +10185,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
   VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
   Header->setName("vec.epilog.vector.body");
 
-  // Re-use the trip count and steps expanded for the main loop, as
-  // skeleton creation needs it as a value that dominates both the scalar
-  // and vector epilogue loops
-  // TODO: This is a workaround needed for epilogue vectorization and it
-  // should be removed once induction resume value creation is done
-  // directly in VPlan.
-  for (auto &R : make_early_inc_range(*Plan.getEntry())) {
-    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
-    if (!ExpandR)
-      continue;
-    auto *ExpandedVal =
-        Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
-    ExpandR->replaceAllUsesWith(ExpandedVal);
-    if (Plan.getTripCount() == ExpandR)
-      Plan.resetTripCount(ExpandedVal);
-    ExpandR->eraseFromParent();
-  }
-
+  DenseMap<Value *, Value *> ToFrozen;
   // Ensure that the start values for all header phi recipes are updated before
   // vectorizing the epilogue loop.
   for (VPRecipeBase &R : Header->phis()) {
@@ -10287,6 +10251,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
        ResumeV =
            Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
       } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+        ToFrozen[RdxDesc.getRecurrenceStartValue()] =
+            cast<PHINode>(ResumeV)->getIncomingValueForBlock(
+                EPI.MainLoopIterationCountCheck);
+
        // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
        // to the resume value. The resume value is adjusted to the sentinel
        // value when the final value from the main vector loop equals the start
@@ -10295,8 +10263,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
        // value. This ensures correctness when the start value might not be
        // less than the minimum value of a monotonically increasing induction
        // variable.
        BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
-        Value *Cmp =
-            Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
+        Value *Cmp = Builder.CreateICmpEQ(
+            ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]);
        ResumeV =
            Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
       }
@@ -10312,6 +10280,62 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
     VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
     cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
   }
+
+  // For some VPValues in the epilogue plan we must re-use the generated IR
+  // values from the main plan. Replace them with live-in VPValues.
+  // TODO: This is a workaround needed for epilogue vectorization and it
+  // should be removed once induction resume value creation is done
+  // directly in VPlan.
+  for (auto &R : make_early_inc_range(*Plan.getEntry())) {
+    // Re-use frozen values from the main plan for Freeze VPInstructions in the
+    // epilogue plan. This ensures all users use the same frozen value.
+    auto *VPI = dyn_cast<VPInstruction>(&R);
+    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
+      VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
+          ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
+      continue;
+    }
+
+    // Re-use the trip count and steps expanded for the main loop, as
+    // skeleton creation needs it as a value that dominates both the scalar
+    // and vector epilogue loops
+    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
+    if (!ExpandR)
+      continue;
+    auto *ExpandedVal =
+        Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+    ExpandR->replaceAllUsesWith(ExpandedVal);
+    if (Plan.getTripCount() == ExpandR)
+      Plan.resetTripCount(ExpandedVal);
+    ExpandR->eraseFromParent();
+  }
+}
+
+// Generate bypass values from the additional bypass block. Note that when the
+// vectorized epilogue is skipped due to the iteration count check, the
+// resume value for the induction variable comes from the trip count of the
+// main vector loop, passed in as \p MainVectorTripCount.
+static Value *createInductionAdditionalBypassValues(
+    PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
+    const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
+    Instruction *OldInduction) {
+  Value *Step = getExpandedStep(II, ExpandedSCEVs);
+  // For the primary induction the additional bypass end value is known.
+  // Otherwise it is computed.
+  Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+  if (OrigPhi != OldInduction) {
+    auto *BinOp = II.getInductionBinOp();
+    // Fast-math-flags propagate from the original induction instruction.
+    if (isa_and_nonnull<FPMathOperator>(BinOp))
+      BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
+
+    // Compute the end value for the additional bypass.
+    EndValueFromAdditionalBypass =
+        emitTransformedIndex(BypassBuilder, MainVectorTripCount,
+                             II.getStartValue(), Step, II.getKind(), BinOp);
+    EndValueFromAdditionalBypass->setName("ind.end");
+  }
+  return EndValueFromAdditionalBypass;
 }
 
 bool LoopVectorizePass::processLoop(Loop *L) {
@@ -10699,7 +10723,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
 
         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
-                        DT, true, &ExpandedSCEVs);
+                        DT, true);
+
+        // Fix induction resume values from the additional bypass block.
+        BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
+        IRBuilder<> BypassBuilder(BypassBlock,
+                                  BypassBlock->getFirstInsertionPt());
+        BasicBlock *PH = L->getLoopPreheader();
+        for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+          auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+          Value *V = createInductionAdditionalBypassValues(
+              IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
+              LVL.getPrimaryInduction());
+          // TODO: Directly add as extra operand to the VPResumePHI recipe.
+          Inc->setIncomingValueForBlock(BypassBlock, V);
+        }
         ++LoopsEpilogueVectorized;
 
         if (!MainILV.areSafetyChecksAdded())
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4e5b1870894c..1799fb91214d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -379,10 +379,6 @@ struct VPTransformState {
   /// memchecks. The actually versioning is performed manually.
   LoopVersioning *LVer = nullptr;
 
-  /// Map SCEVs to their expanded values. Populated when executing
-  /// VPExpandSCEVRecipes.
-  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
-
   /// VPlan-based type analysis.
   VPTypeAnalysis TypeAnalysis;
 };
@@ -1059,6 +1055,8 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
           R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenIntrinsicSC ||
+           R->getVPDefID() == VPRecipeBase::VPReductionSC ||
+           R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
           R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
@@ -1211,6 +1209,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     CanonicalIVIncrementForPart,
     BranchOnCount,
     BranchOnCond,
+    ComputeFindLastIVResult,
     ComputeReductionResult,
     // Takes the VPValue to extract from as first operand and the lane or part
     // to extract as second operand, counting from the end starting with 1 for
@@ -2656,39 +2655,41 @@ class VPInterleaveRecipe : public VPRecipeBase {
 /// A recipe to represent inloop reduction operations, performing a reduction on
 /// a vector operand into a scalar value, and adding the result to a chain.
 /// The Operands are {ChainOp, VecOp, [Condition]}.
-class VPReductionRecipe : public VPSingleDefRecipe {
-  /// The recurrence decriptor for the reduction in question.
-  const RecurrenceDescriptor &RdxDesc;
+class VPReductionRecipe : public VPRecipeWithIRFlags {
+  /// The recurrence kind for the reduction in question.
+  RecurKind RdxKind;
   bool IsOrdered;
   /// Whether the reduction is conditional.
   bool IsConditional = false;
 
 protected:
-  VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
-                    Instruction *I, ArrayRef<VPValue *> Operands,
-                    VPValue *CondOp, bool IsOrdered, DebugLoc DL)
-      : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
+  VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
+                    FastMathFlags FMFs, Instruction *I,
+                    ArrayRef<VPValue *> Operands, VPValue *CondOp,
+                    bool IsOrdered, DebugLoc DL)
+      : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
        IsOrdered(IsOrdered) {
     if (CondOp) {
       IsConditional = true;
       addOperand(CondOp);
     }
+    setUnderlyingValue(I);
   }
 
 public:
-  VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
+  VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
                     bool IsOrdered, DebugLoc DL = {})
-      : VPReductionRecipe(VPDef::VPReductionSC, R, I,
+      : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
                          ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
                          IsOrdered, DL) {}
 
   ~VPReductionRecipe() override = default;
 
   VPReductionRecipe *clone() override {
-    return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
-                                 getVecOp(), getCondOp(), IsOrdered,
-                                 getDebugLoc());
+    return new VPReductionRecipe(RdxKind, getFastMathFlags(),
+                                 getUnderlyingInstr(), getChainOp(), getVecOp(),
+                                 getCondOp(), IsOrdered, getDebugLoc());
   }
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -2714,10 +2715,8 @@ class VPReductionRecipe : public VPSingleDefRecipe {
                VPSlotTracker &SlotTracker) const override;
 #endif
 
-  /// Return the recurrence decriptor for the in-loop reduction.
-  const RecurrenceDescriptor &getRecurrenceDescriptor() const {
-    return RdxDesc;
-  }
+  /// Return the recurrence kind for the in-loop reduction.
+  RecurKind getRecurrenceKind() const { return RdxKind; }
   /// Return true if the in-loop reduction is ordered.
   bool isOrdered() const { return IsOrdered; };
   /// Return true if the in-loop reduction is conditional.
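Callers that previously handed the whole RecurrenceDescriptor to the recipe now pass just the kind and flags. A hypothetical construction site under the new signature, where Desc, I, Chain, Vec and Mask stand in for values a caller already has:

    VPReductionRecipe *makeInLoopReduction(const RecurrenceDescriptor &Desc,
                                           Instruction *I, VPValue *Chain,
                                           VPValue *Vec, VPValue *Mask) {
      // Before: new VPReductionRecipe(Desc, I, Chain, Vec, Mask, false);
      // After: only the two pieces the recipe actually needs survive,
      // so the recipe no longer holds a reference to the descriptor.
      return new VPReductionRecipe(Desc.getRecurrenceKind(),
                                   Desc.getFastMathFlags(), I, Chain, Vec,
                                   Mask, /*IsOrdered=*/false,
                                   I->getDebugLoc());
    }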
@@ -2738,12 +2737,14 @@ class VPReductionRecipe : public VPSingleDefRecipe {
 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
 class VPReductionEVLRecipe : public VPReductionRecipe {
 public:
-  VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
+  VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp,
+                       DebugLoc DL = {})
       : VPReductionRecipe(
-            VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+            VPDef::VPReductionEVLSC, R.getRecurrenceKind(),
+            R.getFastMathFlags(),
            cast_or_null<Instruction>(R.getUnderlyingValue()),
            ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
-            R.isOrdered(), R.getDebugLoc()) {}
+            R.isOrdered(), DL) {}
 
   ~VPReductionEVLRecipe() override = default;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 27357ff04b5f2..112b45576030d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -50,6 +50,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return SetResultTyFromOp();
 
   switch (Opcode) {
+  case Instruction::Freeze:
+    return inferScalarType(R->getOperand(0));
   case Instruction::Select: {
     Type *ResTy = inferScalarType(R->getOperand(1));
     VPValue *OtherV = R->getOperand(2);
@@ -64,6 +66,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
               inferScalarType(R->getOperand(1)) &&
           "different types inferred for different operands");
     return IntegerType::get(Ctx, 1);
+  case VPInstruction::ComputeFindLastIVResult:
   case VPInstruction::ComputeReductionResult: {
     auto *PhiR = cast<VPReductionPHIRecipe>(R->getOperand(0));
     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 4866426ad8848..0f0bb97d23b71 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -233,6 +233,16 @@ using BinaryVPInstruction_match =
     BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false,
                        VPInstruction>;
 
+template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode,
+          bool Commutative, typename... RecipeTys>
+using TernaryRecipe_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t>,
+                                         Opcode, Commutative, RecipeTys...>;
+
+template <typename Op0_t, typename Op1_t, typename Op2_t, unsigned Opcode>
+using TernaryVPInstruction_match =
+    TernaryRecipe_match<Op0_t, Op1_t, Op2_t, Opcode, /*Commutative*/ false,
+                        VPInstruction>;
+
 template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative,
          typename... RecipeTys>
 using AllBinaryRecipe_match =
@@ -251,6 +261,13 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1) {
   return BinaryVPInstruction_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
 }
 
+template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t>
+inline TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>
+m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return TernaryVPInstruction_match<Op0_t, Op1_t, Op2_t, Opcode>(
+      {Op0, Op1, Op2});
+}
+
 template <typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, VPInstruction::Not>
 m_Not(const Op0_t &Op0) {
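The new three-operand m_VPInstruction overload composes like the existing unary and binary forms. A small usage sketch, mirroring the VPlanUnroll.cpp hunk further down (VPlanPatternMatch.h is an internal vectorizer header, so this is illustrative rather than a public API example):

    // Bind the exiting-value operand (operand 2) of a
    // ComputeFindLastIVResult, or return nullptr if R is something else.
    static VPValue *getFindLastIVExitingOperand(VPRecipeBase &R) {
      using namespace llvm::VPlanPatternMatch;
      VPValue *Op1 = nullptr;
      if (match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
                        m_VPValue(), m_VPValue(), m_VPValue(Op1))))
        return Op1;
      return nullptr;
    }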
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 97be2da24fc37..53a0c3906f21d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -406,6 +406,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   if (isSingleScalar() || isVectorToScalar())
     return true;
   switch (Opcode) {
+  case Instruction::Freeze:
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::BranchOnCond:
@@ -450,6 +451,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *A = State.get(getOperand(0));
     return Builder.CreateNot(A, Name);
   }
+  case Instruction::Freeze: {
+    Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
+    return Builder.CreateFreeze(Op, Name);
+  }
   case Instruction::ICmp: {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -575,6 +580,30 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::ComputeFindLastIVResult: {
+    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
+    // and will be removed by breaking up the recipe further.
+    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
+    // Get its reduction variable descriptor.
+    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+    assert(RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
+           "Unexpected reduction kind");
+    assert(!PhiR->isInLoop() &&
+           "In-loop FindLastIV reduction is not supported yet");
+
+    // The recipe's operands are the reduction phi and the start value,
+    // followed by one operand for each part of the reduction.
+    unsigned UF = getNumOperands() - 2;
+    Value *ReducedPartRdx = State.get(getOperand(2));
+    for (unsigned Part = 1; Part < UF; ++Part) {
+      ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx,
+                                      State.get(getOperand(2 + Part)));
+    }
+
+    return createFindLastIVReduction(Builder, ReducedPartRdx,
+                                     State.get(getOperand(1), true), RdxDesc);
+  }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
@@ -584,6 +613,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
     RecurKind RK = RdxDesc.getRecurrenceKind();
+    assert(!RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
+           "should be handled by ComputeFindLastIVResult");
 
     Type *PhiTy = OrigPhi->getType();
     // The recipe's operands are the reduction phi, followed by one operand for
@@ -619,9 +650,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
-        ReducedPartRdx =
-            createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, RdxPart);
       else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
     }
@@ -630,11 +658,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
-        RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
-        RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) &&
+        RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
-      ReducedPartRdx =
-          createReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+      // TODO: Support in-order reductions based on the recurrence descriptor.
+      // All ops in the reduction inherit fast-math-flags from the recurrence
+      // descriptor.
+      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+
+      if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+        ReducedPartRdx =
+            createAnyOfReduction(Builder, ReducedPartRdx, RdxDesc, OrigPhi);
+      else
+        ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
+
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
@@ -729,6 +766,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
+         getOpcode() == VPInstruction::ComputeFindLastIVResult ||
         getOpcode() == VPInstruction::ComputeReductionResult ||
         getOpcode() == VPInstruction::AnyOf;
 }
@@ -787,6 +825,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   if (Instruction::isBinaryOp(getOpcode()))
     return false;
   switch (getOpcode()) {
+  case Instruction::Freeze:
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
@@ -815,6 +854,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case Instruction::Select:
   case Instruction::Or:
   case VPInstruction::PtrAdd:
+  case Instruction::Freeze:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
   case VPInstruction::ActiveLaneMask:
@@ -825,6 +865,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::BranchOnCond:
   case VPInstruction::ResumePhi:
     return true;
+  case VPInstruction::ComputeFindLastIVResult:
+    return Op == getOperand(1);
   };
   llvm_unreachable("switch should return");
 }
@@ -900,6 +942,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::ExtractFromEnd:
     O << "extract-from-end";
     break;
+  case VPInstruction::ComputeFindLastIVResult:
+    O << "compute-find-last-iv-result";
+    break;
   case VPInstruction::ComputeReductionResult:
     O << "compute-reduction-result";
     break;
@@ -1448,7 +1493,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {
   }
   case Instruction::Freeze: {
     Value *Op = State.get(getOperand(0));
-
     Value *Freeze = Builder.CreateFreeze(Op);
     State.set(this, Freeze);
     break;
@@ -2218,10 +2262,12 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "Reduction being replicated.");
   Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
-  RecurKind Kind = RdxDesc.getRecurrenceKind();
+  RecurKind Kind = getRecurrenceKind();
+  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+         "In-loop AnyOf reductions aren't currently supported");
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
-  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+  State.Builder.setFastMathFlags(getFastMathFlags());
   State.setDebugLocFrom(getDebugLoc());
   Value *NewVecOp = State.get(getVecOp());
   if (VPValue *Cond = getCondOp()) {
@@ -2229,12 +2275,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
     VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
     Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
-    Value *Start;
-    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind))
-      Start = RdxDesc.getRecurrenceStartValue();
-    else
-      Start = llvm::getRecurrenceIdentity(Kind, ElementTy,
-                                          RdxDesc.getFastMathFlags());
+    Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
     if (State.VF.isVector())
       Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
 
@@ -2246,21 +2287,22 @@ void VPReductionRecipe::execute(VPTransformState &State) {
   if (IsOrdered) {
     if (State.VF.isVector())
       NewRed =
-          createOrderedReduction(State.Builder, RdxDesc, NewVecOp, PrevInChain);
+          createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
     else
       NewRed = State.Builder.CreateBinOp(
-          (Instruction::BinaryOps)RdxDesc.getOpcode(), PrevInChain, NewVecOp);
+          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
+          PrevInChain, NewVecOp);
     PrevInChain = NewRed;
     NextInChain = NewRed;
   } else {
     PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
-    NewRed = createReduction(State.Builder, RdxDesc, NewVecOp);
+    NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
-      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
-                                   NewRed, PrevInChain);
+      NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
     else
       NextInChain = State.Builder.CreateBinOp(
-          (Instruction::BinaryOps)RdxDesc.getOpcode(), NewRed, PrevInChain);
+          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
+          PrevInChain);
   }
   State.set(this, NextInChain, /*IsScalar*/ true);
 }
@@ -2271,10 +2313,9 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   // Propagate the fast-math flags carried by the underlying instruction.
   IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
-  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
-  Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+  Builder.setFastMathFlags(getFastMathFlags());
 
-  RecurKind Kind = RdxDesc.getRecurrenceKind();
+  RecurKind Kind = getRecurrenceKind();
   Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
   Value *VecOp = State.get(getVecOp());
   Value *EVL = State.get(getEVL(), VPLane(0));
@@ -2291,24 +2332,26 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
 
   Value *NewRed;
   if (isOrdered()) {
-    NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
+    NewRed = createOrderedReduction(VBuilder, Kind, VecOp, Prev);
   } else {
-    NewRed = createSimpleReduction(VBuilder, VecOp, RdxDesc);
+    NewRed = createSimpleReduction(VBuilder, VecOp, Kind, getFastMathFlags());
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
     else
-      NewRed = Builder.CreateBinOp((Instruction::BinaryOps)RdxDesc.getOpcode(),
-                                   NewRed, Prev);
+      NewRed = Builder.CreateBinOp(
+          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), NewRed,
+          Prev);
   }
   State.set(this, NewRed, /*IsScalar*/ true);
 }
 
 InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
-  RecurKind RdxKind = RdxDesc.getRecurrenceKind();
+  RecurKind RdxKind = getRecurrenceKind();
   Type *ElementTy = Ctx.Types.inferScalarType(this);
   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
-  unsigned Opcode = RdxDesc.getOpcode();
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
+  FastMathFlags FMFs = getFastMathFlags();
 
   // TODO: Support any-of and in-loop reductions.
   assert(
@@ -2320,20 +2363,17 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
       (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "In-loop reduction not implemented in VPlan-based cost model currently.");
 
-  assert(ElementTy->getTypeID() == RdxDesc.getRecurrenceType()->getTypeID() &&
-         "Inferred type and recurrence type mismatch.");
-
   // Cost = Reduction cost + BinOp cost
   InstructionCost Cost =
       Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-    return Cost + Ctx.TTI.getMinMaxReductionCost(
-                      Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
+    return Cost +
+           Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
   }
 
-  return Cost + Ctx.TTI.getArithmeticReductionCost(
-                    Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
+  return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
+                                                   Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2344,31 +2384,31 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " +";
-  if (isa<FPMathOperator>(getUnderlyingInstr()))
-    O << getUnderlyingInstr()->getFastMathFlags();
-  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
+  printFlags(O);
+  O << " reduce."
+    << Instruction::getOpcodeName(
+           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+    << " (";
   getVecOp()->printAsOperand(O, SlotTracker);
   if (isConditional()) {
     O << ", ";
     getCondOp()->printAsOperand(O, SlotTracker);
   }
   O << ")";
-  if (RdxDesc.IntermediateStore)
-    O << " (with final reduction value stored in invariant address sank "
-         "outside of loop)";
 }
 
 void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
-  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
   O << Indent << "REDUCE ";
   printAsOperand(O, SlotTracker);
   O << " = ";
   getChainOp()->printAsOperand(O, SlotTracker);
   O << " +";
-  if (isa<FPMathOperator>(getUnderlyingInstr()))
-    O << getUnderlyingInstr()->getFastMathFlags();
-  O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
+  printFlags(O);
+  O << " vp.reduce."
+    << Instruction::getOpcodeName(
+           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
+    << " (";
   getVecOp()->printAsOperand(O, SlotTracker);
   O << ", ";
   getEVL()->printAsOperand(O, SlotTracker);
@@ -2377,9 +2417,6 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
     getCondOp()->printAsOperand(O, SlotTracker);
   }
   O << ")";
-  if (RdxDesc.IntermediateStore)
-    O << " (with final reduction value stored in invariant address sank "
-         "outside of loop)";
 }
 #endif
 
@@ -3336,23 +3373,10 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
 
 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
   assert(!State.Lane && "cannot be used in per-lane");
-  if (State.ExpandedSCEVs.contains(Expr)) {
-    // SCEV Expr has already been expanded, result must already be set. At the
-    // moment we have to execute the entry block twice (once before skeleton
-    // creation to get expanded SCEVs used by the skeleton and once during
-    // regular VPlan execution).
-    State.Builder.SetInsertPoint(State.CFG.VPBB2IRBB[getParent()]);
-    assert(State.get(this, VPLane(0)) == State.ExpandedSCEVs[Expr] &&
-           "Results must match");
-    return;
-  }
-
-  const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
+  const DataLayout &DL = SE.getDataLayout();
   SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
-
   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                  &*State.Builder.GetInsertPoint());
-  State.ExpandedSCEVs[Expr] = Res;
   State.set(this, Res, VPLane(0));
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 89e372d6b46cf..3b380e5a5e6f5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -348,7 +348,9 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
     // the parts to compute the final reduction value.
     VPValue *Op1;
     if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
-                      m_VPValue(), m_VPValue(Op1)))) {
+                      m_VPValue(), m_VPValue(Op1))) ||
+        match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
+                      m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
       addUniformForAllParts(cast<VPInstruction>(&R));
       for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
new file mode 100644
index 0000000000000..c70b782b74229
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -0,0 +1,252 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macosx -S %s | FileCheck %s
+
+define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
+; CHECK-LABEL: define i8 @select_icmp_var_start(
+; CHECK-SAME: ptr [[A:%.*]], i8 [[N:%.*]], i8 [[START:%.*]]) {
+; CHECK-NEXT:  [[ITER_CHECK:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i8 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8
+; CHECK-NEXT:    [[FR:%.*]] = freeze i8 [[START]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <16 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
+; CHECK-NEXT:    [[INDEX4:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[INDEX4]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD3]], splat (i8 3)
+; CHECK-NEXT:    [[TMP10]] = select <16 x i1> [[TMP17]], <16 x i8> [[VEC_IND]], <16 x i8> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11]] = select <16 x i1> [[TMP23]], <16 x i8> [[STEP_ADD]], <16 x i8> [[VEC_PHI2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[STEP_ADD]], splat (i8 16)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]])
+; CHECK-NEXT:    [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]]
+; CHECK-NEXT:    [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
+; CHECK-NEXT:    [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[N_VEC5]] to i8
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:    [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8 +; CHECK-NEXT: [[TMP24:%.*]] = add i8 [[IV]], 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP24]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD12]], splat (i8 3) +; CHECK-NEXT: [[TMP20]] = select <8 x i1> [[TMP19]], <8 x i8> [[VEC_IND7]], <8 x i8> [[VEC_PHI9]] +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i32 [[INDEX6]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <8 x i8> [[VEC_IND7]], splat (i8 8) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT13]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]]) +; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128 +; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]] +; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP1]], align 8 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV1]], i8 [[RDX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[SEL_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i8 [ %start, %entry ], [ %sel, %loop ] + %gep = getelementptr inbounds i8, ptr %a, i8 %iv + %l = load i8, ptr %gep, align 8 + %c = icmp eq i8 %l, 3 + %sel = select i1 %c, i8 %iv, i8 %rdx + %iv.next = add nuw nsw i8 %iv, 1 + %ec = icmp eq i8 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %sel +} + +define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { +; CHECK-LABEL: define i32 @select_icmp_var_start_iv_trunc( +; CHECK-SAME: i32 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]]) +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]] +; CHECK-NEXT: 
[[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]]) +; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub
i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]] +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT10]], zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT14]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] +; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648 +; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]] +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT:
br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX22]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[START]], 0 +; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + %N.pos = icmp sgt i32 %N, 0 + call void @llvm.assume(i1 %N.pos) + %N.ext = zext i32 %N to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i32 [ %start, %entry ], [ %red.next, %loop ] + %c = icmp eq i32 %start, 0 + %iv.trunc = trunc i64 %iv to i32 + %red.next = select i1 %c, i32 %iv.trunc, i32 %red + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %N.ext + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +declare void @llvm.assume(i1 noundef) + +attributes #0 = { "target-cpu"="apple-m1" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index a119707bed120..89bb495045e41 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -26,7 +26,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double -; CHECK-NEXT: WIDEN-CALL ir<%s> = call reassoc nnan ninf nsz arcp contract afn @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) +; CHECK-NEXT: WIDEN-CALL ir<%s> = call fast @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -72,7 +72,7 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double -; CHECK-NEXT: WIDEN-INTRINSIC ir<%s> = call reassoc nnan ninf nsz arcp contract afn llvm.sin(ir<%conv>) +; CHECK-NEXT: WIDEN-INTRINSIC ir<%s> = call fast llvm.sin(ir<%conv>) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index f630f4f21e065..e34d0510be290 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -151,11 +151,10 @@ 
define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF ; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -212,7 +211,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: @@ -400,11 +399,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF ; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF ; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %0 = zext i32 %n to i64 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -461,7 +459,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index a0294f7ac7992..d53339143fc59 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -183,7 +183,7 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY50:%.*]] -; CHECK: vec.epilog.vector.body52: +; CHECK: vec.epilog.vector.body50: ; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ] ; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ] ; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 06f0f05889116..5398155a77f90 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
UTC_ARGS: --check-globals none --version 5 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -epilogue-vectorization-force-VF=4 -S < %s | FileCheck %s define i64 @select_icmp_const(ptr %a, i64 %n) { @@ -212,13 +212,119 @@ loop: exit: ret i64 %sel } -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -;. + +define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { +; CHECK-LABEL: define i8 @select_icmp_var_start( +; CHECK-SAME: ptr [[A:%.*]], i8 [[N:%.*]], i8 [[START:%.*]]) { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = add i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP20]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3) +; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP7]], <4 x i8> [[VEC_IND]], <4 x i8> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END:%.*]] = trunc i32
[[N_VEC]] to i8 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[N_VEC3]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], <i8 0, i8 1, i8 2, i8 3> +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8 +; CHECK-NEXT: [[TMP21:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP21]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD10]], splat (i8 3) +; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP16]], <4 x i8> [[VEC_IND5]], <4 x i8> [[VEC_PHI7]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i32 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i8> [[VEC_IND5]], splat (i8 4) +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT11]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]]) +; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128 +; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]] +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 8 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV]], i8 [[RDX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[SEL_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i8 [ %start, %entry ], [ %sel, %loop ] + %gep = getelementptr inbounds i8, ptr %a, i8 %iv + %l = load i8, ptr %gep, align 8 + %c = icmp eq i8 %l, 3 + %sel = select i1 %c, i8 %iv, i8 %rdx + %iv.next = add nuw nsw i8 %iv, 1 + %ec = icmp eq i8 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %sel +} diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 29b29c500c46e..601c475df56cd 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -842,16 +842,14 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[DOTSPLATINSERT2]], <float 4.000000e+00, float poison, float poison, float poison> -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> poison, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST]], splat (float 4.000000e+00) ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label
[[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index 9ae8f69b50a90..f306d0ccd567b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -16,12 +16,12 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] +; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 +; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] -; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index dd3b50b3e060c..c73b5bfe57b6f 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -5,10 +5,9 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK: VPlan 'Final VPlan for VF={2},UF={1}' { ; CHECK-NEXT: Live-in ir<[[VFxUF:.+]]> = VF * UF ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: ir<%0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -86,7 +85,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]> +; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<%0>, ir<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index beb305f23884e..9f0f3a17f0cbc 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -62,11 +62,10 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK: Executing best plan with VF=8, UF=2 ; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' { ; CHECK-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-NEXT: ir<%and> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %and = and i64 %N, 15 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; 
CHECK-NEXT: ir-bb: @@ -92,7 +91,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<[[C:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]> +; CHECK-NEXT: EMIT vp<[[C:%.+]]> = icmp eq ir<%and>, ir<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[C]]> ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll new file mode 100644 index 0000000000000..11b4efb08bb2e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -0,0 +1,267 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -disable-output %s 2>&1 | FileCheck %s + +; Tests for printing VPlans with reductions. + +define float @print_reduction(i64 %n, ptr noalias %y) { +; CHECK-LABEL: Checking a loop in 'print_reduction' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %loop ] (extra operand: vp<[[RED_EX]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %red = phi float [ %red.next, %loop ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv + %lv = load float, 
ptr %arrayidx, align 4 + %red.next = fadd fast float %lv, %red + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret float %red.next +} + +define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr noalias %dst) { +; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> +; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) +; CHECK-NEXT: IR %red = phi float [ %red.next, %loop ], [ 0.000000e+00, %entry ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: ; preds = %entry, %loop + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %red = phi float [ %red.next, %loop ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv + %lv = load float, ptr %arrayidx, align 4 + %red.next = fadd fast float %lv, %red + store float %red.next, ptr %dst, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void +} + +define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { +; CHECK-LABEL: Checking a loop in 'print_fmuladd_strict' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; 
CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%l.a> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2> +; CHECK-NEXT: WIDEN ir<%l.b> = load vp<[[VEC_PTR2]]> +; CHECK-NEXT: EMIT vp<[[FMUL:%.+]]> = fmul nnan ninf nsz ir<%l.a>, ir<%l.b> +; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) +; CHECK-NEXT: IR %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %loop ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph) +; CHECK: IR %exitcond.not = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %loop ] (extra operand: vp<[[RED_EX]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT:} + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %loop ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %l.a = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv + %l.b = load float, ptr %arrayidx2, align 4 + %muladd = tail call nnan ninf nsz float @llvm.fmuladd.f32(float %l.a, float %l.b, float %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret float %muladd +} + +define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) { +; CHECK-LABEL: Checking a loop in 'find_last_iv' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<{{.+}}> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<-9223372036854775808>, ir<%cond> +; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.a> = 
getelementptr inbounds ir<%a>, vp<[[SCALAR_STEPS]]> +; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%l.a> = load vp<[[VEC_PTR]]> +; CHECK-NEXT: WIDEN ir<%cmp2> = icmp eq ir<%l.a>, ir<%start> +; CHECK-NEXT: WIDEN-SELECT ir<%cond> = select ir<%cmp2>, ir<%iv>, ir<%rdx> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond> +; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-from-end vp<[[RDX_RES]]>, ir<1> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT vp<%bc.resume.val> = resume-phi vp<{{.+}}>, ir<0> +; CHECK-NEXT: EMIT vp<%bc.merge.rdx> = resume-phi vp<[[RDX_RES]]>, ir<%start> +; +; CHECK: ir-bb: +; CHECK-NEXT: IR %cond.lcssa = phi i64 [ %cond, %loop ] (extra operand: vp<[[EXT]]> from middle.block) +; CHECK-NEXT: No successors +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %rdx = phi i64 [ %start, %entry ], [ %cond, %loop ] + %gep.a = getelementptr inbounds i64, ptr %a, i64 %iv + %l.a = load i64, ptr %gep.a, align 8 + %cmp2 = icmp eq i64 %l.a, %start + %cond = select i1 %cmp2, i64 %iv, i64 %rdx + %inc = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i64 %cond +} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 00d8de67a3b40..71fe37d1cb985 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -enable-interleaved-mem-accesses=true -enable-masked-interleaved-mem-accesses -force-widen-divrem-via-safe-divisor=0 -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses=true -enable-masked-interleaved-mem-accesses -force-widen-divrem-via-safe-divisor=0 -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -146,141 +146,6 @@ for.end: ; preds = %for.body, %entry ret void } -define float @print_reduction(i64 %n, ptr noalias %y) { -; CHECK-LABEL: Checking a loop in 'print_reduction' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<%n> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, 
ir<%red.next> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %for.body ] (extra operand: vp<[[RED_EX]]> from middle.block) -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ] - %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv - %lv = load float, ptr %arrayidx, align 4 - %red.next = fadd fast float %lv, %red - %iv.next = add i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, %n - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret float %red.next -} - -define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr noalias %dst) { -; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<%n> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next> -; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> -; CHECK-NEXT: No 
successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>, ir<%red.next> -; CHECK-NEXT: CLONE store vp<[[RED_RES]]>, ir<%dst> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK-NEXT: IR %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ] -; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ] - %arrayidx = getelementptr inbounds float, ptr %y, i64 %iv - %lv = load float, ptr %arrayidx, align 4 - %red.next = fadd fast float %lv, %red - store float %red.next, ptr %dst, align 4 - %iv.next = add i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, %n - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { @@ -461,80 +326,6 @@ for.end: ret void } -define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: Checking a loop in 'print_fmuladd_strict' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<%n> = original trip-count -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: Successor(s): vector.ph -; CHECK-EMPTY: -; CHECK-NEXT: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK-EMPTY: -; CHECK-NEXT: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%l.a> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2> -; CHECK-NEXT: WIDEN ir<%l.b> = load vp<[[VEC_PTR2]]> -; CHECK-NEXT: EMIT vp<[[FMUL:%.+]]> = fmul nnan ninf nsz ir<%l.a>, ir<%l.b> -; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>) -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK-EMPTY: -; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = 
compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> -; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK-NEXT: IR %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] (extra operand: vp<[[RED_RESUME]]> from scalar.ph) -; CHECK: IR %exitcond.not = icmp eq i64 %iv.next, %n -; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %for.body ] (extra operand: vp<[[RED_EX]]> from middle.block) -; CHECK-NEXT: No successors -; CHECK-NEXT:} - -entry: - br label %for.body - -for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] - %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv - %l.a = load float, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv - %l.b = load float, ptr %arrayidx2, align 4 - %muladd = tail call nnan ninf nsz float @llvm.fmuladd.f32(float %l.a, float %l.b, float %sum.07) - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %n - br i1 %exitcond.not, label %for.end, label %for.body - -for.end: - ret float %muladd -} - define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 { ; CHECK-LABEL: Checking a loop in 'debug_loc_vpinstruction' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { @@ -800,7 +591,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr % ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.y> ; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN ir<%add> = fadd nnan ir<%lv>, ir<1.000000e+00> -; CHECK-NEXT: WIDEN ir<%mul> = fmul reassoc nnan ninf nsz arcp contract afn ir<%add>, ir<2.000000e+00> +; CHECK-NEXT: WIDEN ir<%mul> = fmul fast ir<%add>, ir<2.000000e+00> ; CHECK-NEXT: WIDEN ir<%div> = fdiv reassoc nsz contract ir<%mul>, ir<2.000000e+00> ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x> @@ -1224,8 +1015,8 @@ define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, pt ; CHECK-NEXT: vp<[[PTR2:%.+]]> = vector-pointer ir<[[GEP2]]> ; CHECK-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]> ; CHECK-NEXT: WIDEN ir<[[FCMP:%.+]]> = fcmp ogt ir<[[LD1]]>, ir<[[LD2]]> -; CHECK-NEXT: WIDEN ir<[[FADD:%.+]]> = fadd reassoc nnan ninf nsz arcp contract afn ir<[[LD1]]>, ir<1.000000e+01> -; CHECK-NEXT: WIDEN-SELECT ir<[[SELECT:%.+]]> = select reassoc nnan ninf nsz arcp contract afn ir<[[FCMP]]>, ir<[[FADD]]>, ir<[[LD2]]> +; CHECK-NEXT: WIDEN ir<[[FADD:%.+]]> = fadd fast ir<[[LD1]]>, ir<1.000000e+01> +; CHECK-NEXT: WIDEN-SELECT ir<[[SELECT:%.+]]> = select fast ir<[[FCMP]]>, ir<[[FADD]]>, ir<[[LD2]]> ; CHECK-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds nuw ir<%a>, vp<[[ST]]> ; CHECK-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> ; CHECK-NEXT: WIDEN store vp<[[PTR3]]>, 
ir<[[SELECT]]> diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index e7987a95f1ca2..3657f7261b9df 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1120,29 +1120,35 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { } { + auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), + PoisonValue::get(Int32)); VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, - VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, + CondOp, VecOp, false); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); EXPECT_FALSE(Recipe.mayReadOrWriteMemory()); + delete Add; } { + auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), + PoisonValue::get(Int32)); VPValue *ChainOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, - VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, + CondOp, VecOp, false); VPValue *EVL = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 4)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_FALSE(EVLRecipe.mayHaveSideEffects()); EXPECT_FALSE(EVLRecipe.mayReadFromMemory()); EXPECT_FALSE(EVLRecipe.mayWriteToMemory()); EXPECT_FALSE(EVLRecipe.mayReadOrWriteMemory()); + delete Add; } { @@ -1484,28 +1490,34 @@ TEST_F(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { TEST_F(VPRecipeTest, CastVPReductionRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); + auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), + PoisonValue::get(Int32)); VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, - VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, + CondOp, VecOp, false); EXPECT_TRUE(isa<VPUser>(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa<VPUser>(BaseR)); + delete Add; } TEST_F(VPRecipeTest, CastVPReductionEVLRecipeToVPUser) { IntegerType *Int32 = IntegerType::get(C, 32); + auto *Add = BinaryOperator::CreateAdd(PoisonValue::get(Int32), + PoisonValue::get(Int32)); VPValue *ChainOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *VecOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 2)); VPValue *CondOp = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 3)); - VPReductionRecipe Recipe(RecurrenceDescriptor(), nullptr, ChainOp, CondOp, - VecOp, false); + VPReductionRecipe Recipe(RecurKind::Add, FastMathFlags(), Add, ChainOp, + CondOp, VecOp, false); VPValue *EVL = getPlan().getOrAddLiveIn(ConstantInt::get(Int32, 0)); VPReductionEVLRecipe EVLRecipe(Recipe, *EVL, CondOp); EXPECT_TRUE(isa<VPUser>(&EVLRecipe)); VPRecipeBase *BaseR = &EVLRecipe; EXPECT_TRUE(isa<VPUser>(BaseR)); + delete Add; } } // namespace
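
A note on the pattern the epilogue tests above check repeatedly: a FindLastIV reduction keeps the candidate IV lanes in a vector seeded with the signed-minimum sentinel (e.g. splat (i8 -128)), combines the per-part select chains with smax, reduces, and only falls back to the (frozen) start value when no lane ever matched. A distilled sketch of that middle-block merge for the i8 case, with hypothetical value names rather than lines taken verbatim from the tests:

; Middle-block merge, sketched. %part0/%part1 are the per-part select chains;
; i8 -128 (the signed minimum) is the sentinel meaning "no iteration matched";
; %fr.start is the frozen start value.
define i8 @findlast_merge_sketch(<16 x i8> %part0, <16 x i8> %part1, i8 %fr.start) {
  %combined = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %part0, <16 x i8> %part1)
  %max = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %combined)
  %found = icmp ne i8 %max, -128
  %res = select i1 %found, i8 %max, i8 %fr.start
  ret i8 %res
}
declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)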
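The companion check in the epilogue preheader works in the opposite direction: if the value merged out of the main vector loop still equals the frozen start, nothing has matched yet, so the epilogue's reduction phi must be re-seeded with the sentinel rather than with the start value. A minimal sketch, again with hypothetical names:

; Epilogue resume, sketched: %rdx.main is the main loop's merged result.
define i8 @findlast_epilogue_resume_sketch(i8 %rdx.main, i8 %fr.start) {
  %no.match = icmp eq i8 %rdx.main, %fr.start
  %resume = select i1 %no.match, i8 -128, i8 %rdx.main
  ret i8 %resume
}

The start value is frozen once up front so that these equality compares are well defined even when %start is poison or undef, which is why the CHECK lines compare against [[FR]] rather than [[START]].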