From f927502a0586c490d08b0491781deab6de9fd548 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 4 Sep 2024 17:27:55 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  14 ++-
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 109 ++++++++++++------
 .../reduction-whole-regs-loads.ll             |  28 +++--
 3 files changed, 104 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54..67ded1e21c483 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2531,7 +2531,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

   unsigned getNumberOfParts(Type *Tp) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
-    return LT.first.isValid() ? *LT.first.getValue() : 0;
+    if (!LT.first.isValid())
+      return 0;
+    // Try to find the actual number of parts for non-power-of-2 elements as
+    // ceil(num-of-elements / num-of-subtype-elements).
+    if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+        Tp && LT.second.isFixedLengthVector() &&
+        !has_single_bit(FTp->getNumElements())) {
+      if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
+              EVT(LT.second).getTypeForEVT(Tp->getContext()));
+          SubTp && SubTp->getElementType() == FTp->getElementType())
+        return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
+    }
+    return *LT.first.getValue();
   }

   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 60476398e5ca7..3c647b36e9849 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                               VF * getNumElements(ScalarTy));
 }

+/// Returns the number of elements of the given type \p Ty, not less than \p
+/// Sz, that forms a type which \p TTI splits into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+                                              Type *Ty, unsigned Sz) {
+  if (!isValidElementType(Ty))
+    return bit_ceil(Sz);
+  // Find the number of elements which forms full vectors.
+  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+  if (NumParts == 0 || NumParts >= Sz)
+    return bit_ceil(Sz);
+  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
 /// total number of elements \p Size and number of registers (parts) \p
 /// NumParts.
 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
-  return PowerOf2Ceil(divideCeil(Size, NumParts));
+  return std::min(Size, bit_ceil(divideCeil(Size, NumParts)));
 }

 /// Returns correct remaining number of elements, considering total amount \p
@@ -1223,6 +1237,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }

+/// Returns true if the widened type of \p Ty with \p Sz elements forms only
+/// full vector types, i.e. adding an extra element results in extra parts upon
+/// type legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+                               unsigned Sz) {
+  if (Sz <= 1)
+    return false;
+  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+    return false;
+  if (has_single_bit(Sz))
+    return true;
+  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
+         Sz % NumParts == 0;
+}
+
 namespace slpvectorizer {

 /// Bottom Up SLP Vectorizer.
@@ -2466,7 +2496,9 @@ class BoUpSLP {
       }
       // TODO: Check if we can remove a check for non-power-2 number of
       // scalars after full support of non-power-2 vectorization.
-      return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+      return UniqueValues.size() != 2 &&
+             hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+                                UniqueValues.size());
     };

     // If the initial strategy fails for any of the operand indexes, then we
@@ -3275,8 +3307,9 @@ class BoUpSLP {
                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;

     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {
-      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+      bool IsNonPowerOf2 = !hasFullVectorsOnly(
+          TTI, getValueType(Scalars.front()), Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3454,7 +3487,7 @@ class BoUpSLP {

     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;
@@ -4732,7 +4765,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !has_single_bit(VL.size())) {
+  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;
@@ -4786,12 +4819,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
         });
       });
       const unsigned AbsoluteDiff = std::abs(*Diff);
-      if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
-                                ((Sz > MinProfitableStridedLoads ||
-                                  (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                                   has_single_bit(AbsoluteDiff))) &&
-                                 AbsoluteDiff > Sz) ||
-                                *Diff == -(static_cast<int>(Sz) - 1))) {
+      if (IsPossibleStrided &&
+          (IsAnyPointerUsedOutGraph ||
+           ((Sz > MinProfitableStridedLoads ||
+             (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+              hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
+            AbsoluteDiff > Sz) ||
+           *Diff == -(static_cast<int>(Sz) - 1))) {
         int Stride = *Diff / static_cast<int>(Sz - 1);
         if (*Diff == Stride * static_cast<int>(Sz - 1)) {
           Align Alignment =
@@ -5196,7 +5230,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec())
+  if (TE.isNonPowOf2Vec(*TTI))
     return std::nullopt;

   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5580,7 +5614,7 @@ void BoUpSLP::reorderTopToBottom() {

   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF /= 2) {
+       VF -= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -5753,7 +5787,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec())
+  if (UserTE->isNonPowOf2Vec(*TTI))
     return false;

   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5928,7 +5962,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto AllowsReordering = [&](const TreeEntry *TE) {
         // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-        if (TE->isNonPowOf2Vec())
+        if (TE->isNonPowOf2Vec(*TTI))
           return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6581,7 +6615,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -6987,24 +7021,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         UniqueValues.emplace_back(V);
       }
       size_t NumUniqueScalarValues = UniqueValues.size();
-      if (NumUniqueScalarValues == VL.size()) {
+      bool IsFullVectors =
+          hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+                             NumUniqueScalarValues);
+      if (NumUniqueScalarValues == VL.size() &&
+          (VectorizeNonPowerOf2 || IsFullVectors)) {
         ReuseShuffleIndices.clear();
       } else {
         // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                               "for nodes with padding.\n");
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
          return false;
        }
        LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-        if (NumUniqueScalarValues <= 1 ||
-            (UniquePositions.size() == 1 && all_of(UniqueValues,
-                                                   [](Value *V) {
-                                                     return isa<UndefValue>(V) ||
-                                                            !isConstant(V);
-                                                   })) ||
-            !llvm::has_single_bit(NumUniqueScalarValues)) {
+        if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
+            (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
+               return isa<UndefValue>(V) || !isConstant(V);
+             }))) {
           if (DoNotFail && UniquePositions.size() > 1 &&
               NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
               all_of(UniqueValues, [=](Value *V) {
                 return isa<ExtractElementInst>(V) ||
                        areAllUsersVectorized(cast<Instruction>(V),
                                              UserIgnoreList);
               })) {
@@ -7012,7 +7047,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                        areAllUsersVectorized(cast<Instruction>(V),
                                              UserIgnoreList);
               })) {
-            unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+            // Find the number of elements which forms full vectors.
+            unsigned PWSz = getFullVectorNumberOfElements(
+                *TTI, UniqueValues.front()->getType(), UniqueValues.size());
             if (PWSz == VL.size()) {
               ReuseShuffleIndices.clear();
             } else {
@@ -9107,9 +9144,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       return nullptr;
     Value *VecBase = nullptr;
     ArrayRef<Value *> VL = E->Scalars;
-    // If the resulting type is scalarized, do not adjust the cost.
-    if (NumParts == VL.size())
-      return nullptr;
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
     bool PrevNodeFound = any_of(
@@ -9762,7 +9796,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         InsertMask[Idx] = I + 1;
       }
       unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
-      if (NumOfParts > 0)
+      if (NumOfParts > 0 && NumOfParts < NumElts)
         VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
       unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                        VecScalarsSz;
@@ -10991,7 +11025,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+          KeepScalar =
+              ScalarUsesCount <= 1 ||
+              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11688,13 +11724,14 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  if (TE->isNonPowOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  // Number of scalars must be divisible by NumParts.
+  if (VL.size() % NumParts != 0)
+    return {};
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17005,7 +17042,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   for (unsigned I = NextInst; I < MaxInst; ++I) {
     unsigned ActualVF = std::min(MaxInst - I, VF);

-    if (!has_single_bit(ActualVF))
+    if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
       continue;

     if (MaxVFOnly && ActualVF < MaxVF)
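For illustration only (not part of the patch): a minimal standalone C++20 sketch of the part-count arithmetic the new helpers rely on. The names only mirror the LLVM functions, and the assumption that a widened <6 x i64> legalizes into three 2-element subvectors stands in for whatever TTI::getNumberOfParts would report on a real target:

  // sketch_part_math.cpp -- standalone C++20, not LLVM code.
  #include <algorithm>
  #include <bit>
  #include <cstdio>

  static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }

  // Mirrors the new getNumberOfParts() fallback:
  // ceil(num-of-elements / num-of-subtype-elements).
  static unsigned numberOfParts(unsigned NumElts, unsigned SubTyElts) {
    return divideCeil(NumElts, SubTyElts);
  }

  // Mirrors hasFullVectorsOnly(): Sz splits into equal power-of-2 sized parts.
  static bool hasFullVectorsOnly(unsigned Sz, unsigned NumParts) {
    return Sz > 1 && NumParts > 0 && NumParts < Sz &&
           std::has_single_bit(Sz / NumParts) && Sz % NumParts == 0;
  }

  // Mirrors getFullVectorNumberOfElements(): round Sz up to whole parts.
  static unsigned fullVectorNumberOfElements(unsigned Sz, unsigned NumParts) {
    if (NumParts == 0 || NumParts >= Sz)
      return std::bit_ceil(Sz);
    return std::bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
  }

  // Mirrors the patched getPartNumElems(): per-part size clamped to the total.
  static unsigned partNumElems(unsigned Size, unsigned NumParts) {
    return std::min(Size, std::bit_ceil(divideCeil(Size, NumParts)));
  }

  int main() {
    // Assumed: a widened <6 x i64> legalizes into 2-element subvectors.
    unsigned Parts = numberOfParts(6, 2);
    std::printf("parts for 6 x i64         : %u\n", Parts);                    // 3
    std::printf("only full vectors         : %d\n", hasFullVectorsOnly(6, Parts)); // 1
    std::printf("full-vector element count : %u\n",
                fullVectorNumberOfElements(6, Parts));                          // 6
    std::printf("elements per part         : %u\n", partNumElems(6, Parts));    // 2
  }

Under that assumption a 6-element node counts as "full vectors only" (three whole 2-element parts), which is exactly the case the test below exercises.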
diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
index 281b5f99540ea..4074b8654362e 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
@@ -1,21 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s --check-prefix=RISCV
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target

 define i64 @test(ptr %p) {
+; RISCV-LABEL: @test(
+; RISCV-NEXT:  entry:
+; RISCV-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; RISCV-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; RISCV-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; RISCV-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
+; RISCV-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; RISCV-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; RISCV-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]],
+; RISCV-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; RISCV-NEXT:    ret i64 [[TMP6]]
+;
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]],
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]],
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
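In the same spirit, a second standalone sketch (again not LLVM code, reusing the hypothetical 6-scalar node from above) of how the gather path now walks a node per register part in isGatherShuffledEntry(), and why the old divisibility assert became an early bail-out:

  // sketch_gather_slices.cpp -- standalone C++20, not LLVM code.
  #include <algorithm>
  #include <bit>
  #include <cstdio>

  static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }

  // Per-part slice size, as in the patched getPartNumElems().
  static unsigned partNumElems(unsigned Size, unsigned NumParts) {
    return std::min(Size, std::bit_ceil(divideCeil(Size, NumParts)));
  }

  // Walks a gather node's scalars part by part; an indivisible scalar count
  // now gives up instead of asserting.
  static void sliceGather(unsigned NumScalars, unsigned NumParts) {
    if (NumScalars % NumParts != 0) {
      std::printf("%u scalars / %u parts: give up (not divisible)\n",
                  NumScalars, NumParts);
      return;
    }
    unsigned SliceSize = partNumElems(NumScalars, NumParts);
    for (unsigned Part = 0; Part < NumParts; ++Part)
      std::printf("part %u covers scalars [%u, %u)\n", Part, Part * SliceSize,
                  std::min(NumScalars, (Part + 1) * SliceSize));
  }

  int main() {
    sliceGather(6, 3); // three 2-element slices: [0,2) [2,4) [4,6)
    sliceGather(6, 4); // 6 % 4 != 0 -> give up, matching the new early return
  }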