diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 39011e7d935ac..01231cb93c680 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -212,6 +212,11 @@ static cl::opt VectorizeCopyableElements( cl::desc("Try to replace values with the idempotent instructions for " "better vectorization.")); +static cl::opt LoopAwareMinTripCount( + "slp-cost-loop-min-trip-count", cl::init(1), cl::Hidden, + cl::desc("Minimum loop trip count, considered by the cost model during " + "modeling (0=loops are ignored and considered flat code)")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -2050,6 +2055,7 @@ class BoUpSLP { UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + LoopNest.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -3592,6 +3598,15 @@ class BoUpSLP { TargetTransformInfo::CastContextHint getCastContextHint(const TreeEntry &TE) const; + /// \returns the scale of the given tree entry to the loop iteration. + /// \p Scalar is the scalar value from entry, if using the parent for the + /// external use. + /// \p U is the user of the vectorized value from entry, if using the parent + /// for the external use. + unsigned getScaleToLoopIterations(const TreeEntry &TE, + Value *Scalar = nullptr, + Instruction *U = nullptr) const; + /// \returns the cost of the vectorizable entry. InstructionCost getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -4492,6 +4507,10 @@ class BoUpSLP { std::tuple, VectorType *, unsigned, bool>> CompressEntryToData; + /// The loop nest, used to check if only single loop nest is vectorized, not + /// multiple, to avoid side-effects from loop-aware cost model. 
+ SmallVector LoopNest; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L) @@ -9172,6 +9191,33 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } +/// Find the innermost loop starting from \p L, for which at least single value +/// in \p VL is not invariant. +static const Loop *findInnermostNonInvariantLoop(const Loop *L, + ArrayRef VL) { + assert(L && "Expected valid loop"); + auto IsLoopInvariant = [&](const Loop *L, ArrayRef VL) { + return all_of(VL, [&](Value *V) { return L->isLoopInvariant(V); }); + }; + while (L && IsLoopInvariant(L, VL)) + L = L->getParentLoop(); + return L; +} + +/// Get the loop nest for the given loop. +static SmallVector getLoopNest(const Loop *L) { + assert(L && "Expected valid loop"); + SmallVector LoopNest; + if (LoopAwareMinTripCount == 0) + return LoopNest; + while (L) { + LoopNest.push_back(L); + L = L->getParentLoop(); + } + SmallVector Res(LoopNest.rbegin(), LoopNest.rend()); + return Res; +} + BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, @@ -9179,6 +9225,43 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( assert(S.getMainOp() && "Expected instructions with same/alternate opcodes only."); + // Check the loop nest. Need to be sure, we handle single loop nest at the + // time to avoid incorrect cost estimation because of the loop aware cost + // model. + if (VectorizableTree.empty()) { + assert(LoopNest.empty() && "Expected empty loop nest"); + // Process the first node? Initial fill of the loop nest. 
+ BasicBlock *Parent = S.getMainOp()->getParent(); + if (const Loop *L = LI->getLoopFor(Parent)) { + L = findInnermostNonInvariantLoop(L, VL); + if (L) + LoopNest = getLoopNest(L); + } + } else { + BasicBlock *Parent = S.getMainOp()->getParent(); + if (const Loop *L = LI->getLoopFor(Parent)) { + // Check that the new loop nest is not involved. + // Otherwise, mark it as a gather node. + L = findInnermostNonInvariantLoop(L, VL); + if (L) { + SmallVector NewLoopNest = getLoopNest(L); + for (const auto [L1, L2] : zip_longest(LoopNest, NewLoopNest)) { + if (L1 && L2) { + if (*L1 != *L2) { + LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n"); + return TreeEntry::NeedToGather; + } + continue; + } + if (!L2) + break; + assert(!L1 && "L1 is expected to be null"); + LoopNest.push_back(*L2); + } + } + } + } + unsigned ShuffleOrOp = S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); Instruction *VL0 = S.getMainOp(); @@ -13468,6 +13551,59 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { return TTI::CastContextHint::None; } +/// Get the minimum loop trip count for the loop \p L. +static unsigned getLoopMinTripCount(const Loop *L, ScalarEvolution &SE) { + if (LoopAwareMinTripCount == 0) + return 1; + // Multiple exiting blocks - skip. 
+ if (!L->getExitingBlock()) + return LoopAwareMinTripCount; + if (unsigned Scale = SE.getSmallConstantTripCount(L)) + return Scale; + return LoopAwareMinTripCount; +} + +unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar, + Instruction *U) const { + unsigned Scale = 1; + if (TE.State == TreeEntry::SplitVectorize) + return Scale; + BasicBlock *Parent = nullptr; + if (U) { + Parent = U->getParent(); + } else if (TE.isGather()) { + EdgeInfo EI = TE.UserTreeIndex; + while (EI.UserTE) { + if (EI.UserTE->isGather()) { + EI = EI.UserTE->UserTreeIndex; + continue; + } + if (EI.UserTE->State == TreeEntry::Vectorize && + EI.UserTE->getOpcode() == Instruction::PHI) { + auto *PH = cast(EI.UserTE->getMainOp()); + Parent = PH->getIncomingBlock(EI.EdgeIdx); + } else { + Parent = EI.UserTE->getMainOp()->getParent(); + } + break; + } + if (!Parent) + return Scale; + } else { + Parent = TE.getMainOp()->getParent(); + } + if (const Loop *L = LI->getLoopFor(Parent)) { + L = findInnermostNonInvariantLoop(L, Scalar ? ArrayRef(Scalar) + : ArrayRef(TE.Scalars)); + if (L) { + SmallVector Nest = getLoopNest(L); + for (const Loop *L : reverse(Nest)) + Scale *= getLoopMinTripCount(L, *SE); + } + } + return Scale; +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallPtrSetImpl &CheckedExtracts) { @@ -14862,10 +14998,14 @@ InstructionCost BoUpSLP::getSpillCost() { if (It != MinBWs.end()) ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first); auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor()); - Cost += TTI->getCostOfKeepingLiveOverCall(VecTy); + unsigned Scale = getScaleToLoopIterations(*Op); + InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy); + KeepLiveCost *= Scale; + Cost += KeepLiveCost; if (ScalarTy->isVectorTy()) { // Handle revec dead vector instructions. 
- Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy); + Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) * + Scale; } }; // Memoize the relationship between blocks, i.e. if there is (at least one) @@ -15164,7 +15304,7 @@ template struct ShuffledInsertData { InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, InstructionCost ReductionCost) { - InstructionCost Cost = ReductionCost; + InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -15199,13 +15339,29 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) && "Expected gather nodes with users only."); - InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); + InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts) * + getScaleToLoopIterations(TE); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } + // Add reduced value cost, if resized. + Instruction *ReductionRoot = nullptr; + if (UserIgnoreList) { + const auto It = find_if(*UserIgnoreList, IsaPred); + assert(It != UserIgnoreList->end() && "Expected reduction instruction."); + ReductionRoot = cast(*It); + // Scale reduction cost to the factor of the loop nest trip count. + ReductionCost *= + getScaleToLoopIterations(*VectorizableTree.front().get(), + /*Scalar=*/nullptr, ReductionRoot); + } + + // Add the cost for reduction. 
+ Cost += ReductionCost; + if (Cost >= -SLPCostThreshold && none_of(ExternalUses, [](const ExternalUser &EU) { return isa_and_nonnull(EU.User); @@ -15495,6 +15651,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, } } + ExtraCost *= getScaleToLoopIterations(EU.E, EU.Scalar, + cast_or_null(EU.User)); + ExtractCost += ExtraCost; } // Insert externals for extract of operands of casts to be emitted as scalars @@ -15506,7 +15665,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, TEs.front()->findLaneForValue(V)); } } - // Add reduced value cost, if resized. if (!VectorizedVals.empty()) { const TreeEntry &Root = *VectorizableTree.front(); auto BWIt = MinBWs.find(&Root); @@ -15524,9 +15682,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, assert(SLPReVec && "Only supported by REVEC."); SrcTy = getWidenedType(SrcTy, VecTy->getNumElements()); } - Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy, - TTI::CastContextHint::None, - TTI::TCK_RecipThroughput); + InstructionCost CastCost = + TTI->getCastInstrCost(Opcode, DstTy, SrcTy, + TTI::CastContextHint::None, + TTI::TCK_RecipThroughput) * + getScaleToLoopIterations(Root, /*Scalar=*/nullptr, ReductionRoot); + Cost += CastCost; } } } @@ -15598,6 +15759,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, })) { InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask); + C *= getScaleToLoopIterations(*TEs.front()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of insertelement " "external users.\n"; @@ -15616,6 +15778,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF); InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask); + C *= getScaleToLoopIterations(*TEs.back()); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for final shuffle of vector node and external " "insertelement users.\n"; @@ -15669,7 +15832,6 
@@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, auto *DstVecTy = getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor()); TTI::CastContextHint CCH = getCastContextHint(E); - InstructionCost CastCost; switch (E.getOpcode()) { case Instruction::SExt: case Instruction::ZExt: @@ -15681,8 +15843,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, default: break; } - CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, - TTI::TCK_RecipThroughput); + InstructionCost CastCost = + TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, + TTI::TCK_RecipThroughput) * + getScaleToLoopIterations(*VectorizableTree.front().get(), + /*Scalar=*/nullptr, ReductionRoot); Cost += CastCost; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost << " for final resize for reduction from " << SrcVecTy @@ -15706,8 +15871,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, OS << *SpillCost; else OS << ""; - OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"; + if (ReductionRoot) + OS << "SLP: Reduction Cost = " << ReductionCost << ".\n"; + OS << "SLP: Total Cost = " << Cost << ".\n"; } LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 17ae33652b6d8..245d284fb3169 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -136,30 +136,31 @@ for.end: ; preds = %for.body define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 +; 
CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi float [ [[TMP4]], [[ENTRY]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP21]] = load float, ptr [[ARRAYIDX24]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4 -; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP10]] = load float, ptr [[ARRAYIDX19]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x 
i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP21]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], ; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll index f1cd42a2c404a..7641dd5eb4210 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -86,7 +86,7 @@ for.body: ; preds = %for.body, %entry ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' - ; YAML-NEXT: - Cost: '-5' + ; YAML-NEXT: - Cost: '-40' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' @@ -96,7 +96,7 @@ for.body: ; preds = %for.body, %entry ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-7' + ; YAML-NEXT: - Cost: '-56' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '1' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll index 6d96d6d29cd59..82fcde5319aa0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll @@ -62,7 +62,7 @@ for.body: ; preds = %for.body, %entry ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: 
- String: 'Stores SLP vectorized with cost ' - ; YAML-NEXT: - Cost: '-1' + ; YAML-NEXT: - Cost: '-8' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4'