[SLP]Initial support for non-power-of-2 vectorization #151530
base: main
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-systemz

Author: Alexey Bataev (alexey-bataev)

Changes

Enables non-power-of-2 vectorization within the SLP tree. The root nodes are still required to be power-of-2; this will be addressed in follow-up patches.

Patch is 266.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151530.diff

59 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 593868fb8811a..e2d10b69fbb0d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1856,8 +1856,10 @@ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
- if (NumParts >= Sz || Sz % NumParts != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
+ if (NumParts >= Sz || PWSz % NumParts != 0 ||
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
return NumParts;
}
@@ -1994,6 +1996,9 @@ class BoUpSLP {
VectorizableTree.front()->getVectorFactor());
}
+ /// Returns true if the tree is a reduction tree.
+ bool isReductionTree() const { return UserIgnoreList; }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -2185,6 +2190,21 @@ class BoUpSLP {
unsigned *BestVF = nullptr,
bool TryRecursiveCheck = true) const;
+ /// Checks if the given array of vectorized values has the same node in the
+ /// tree.
+ bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
+ if (S) {
+ if (any_of(getTreeEntries(S.getMainOp()),
+ [&](const TreeEntry *TE) { return TE->isSame(VL); }))
+ return true;
+ return any_of(ValueToGatherNodes.lookup(S.getMainOp()),
+ [&](const TreeEntry *TE) { return TE->isSame(VL); });
+ }
+ return any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->isSame(VL);
+ });
+ }
+
/// Registers non-vectorizable sequence of loads
template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
@@ -3224,11 +3244,7 @@ class BoUpSLP {
}))
return false;
}
- // TODO: Check if we can remove a check for non-power-2 number of
- // scalars after full support of non-power-2 vectorization.
- return UniqueValues.size() != 2 &&
- hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
- UniqueValues.size());
+ return UniqueValues.size() != 2;
};
// If the initial strategy fails for any of the operand indexes, then we
@@ -3663,8 +3679,8 @@ class BoUpSLP {
std::optional<TargetTransformInfo::ShuffleKind>
isGatherShuffledSingleRegisterEntry(
const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
- SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
- bool ForOrder);
+ SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
+ unsigned SliceSize);
/// Checks if the gathered \p VL can be represented as multi-register
/// shuffle(s) of previous tree entries.
@@ -4055,17 +4071,6 @@ class BoUpSLP {
return IsNonPowerOf2;
}
- /// Return true if this is a node, which tries to vectorize number of
- /// elements, forming whole vectors.
- bool
- hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
- bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
- TTI, getValueType(Scalars.front()), Scalars.size());
- assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
- "Reshuffling not supported with non-power-of-2 vectors yet.");
- return IsNonPowerOf2;
- }
-
Value *getOrdered(unsigned Idx) const {
assert(isGather() && "Must be used only for buildvectors/gathers.");
if (ReorderIndices.empty())
@@ -4222,12 +4227,6 @@ class BoUpSLP {
if (UserTreeIdx.UserTE)
OperandsToTreeEntry.try_emplace(
std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
- // FIXME: Remove once support for ReuseShuffleIndices has been implemented
- // for non-power-of-two vectors.
- assert(
- (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
- ReuseShuffleIndices.empty()) &&
- "Reshuffling scalars not yet supported for nodes with padding");
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
if (ReorderIndices.empty()) {
@@ -4386,21 +4385,16 @@ class BoUpSLP {
class ScalarsVectorizationLegality {
InstructionsState S;
bool IsLegal;
- bool TryToFindDuplicates;
bool TrySplitVectorize;
public:
ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
- bool TryToFindDuplicates = true,
bool TrySplitVectorize = false)
- : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
- TrySplitVectorize(TrySplitVectorize) {
- assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
- "Inconsistent state");
+ : S(S), IsLegal(IsLegal), TrySplitVectorize(TrySplitVectorize) {
+ assert((!IsLegal || S.valid()) && "Inconsistent state");
}
const InstructionsState &getInstructionsState() const { return S; };
bool isLegal() const { return IsLegal; }
- bool tryToFindDuplicates() const { return TryToFindDuplicates; }
bool trySplitVectorize() const { return TrySplitVectorize; }
};
@@ -5567,7 +5561,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
ArrayRef<int> Mask, int PartSz, int NumParts,
function_ref<unsigned(unsigned)> GetVF) {
- for (int I : seq<int>(0, NumParts)) {
+ for (int I : seq<int>(NumParts)) {
if (ShuffledSubMasks.test(I))
continue;
const int VF = GetVF(I);
@@ -5618,6 +5612,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
SecondVecFound = true;
break;
}
+ if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
+ break;
if (CurrentOrder[I * PartSz + Idx] >
static_cast<unsigned>(I * PartSz + K) &&
CurrentOrder[I * PartSz + Idx] !=
@@ -5636,12 +5632,14 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
- if (!ExtractShuffles[I])
+ if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
return 0U;
unsigned VF = 0;
unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
+ if (static_cast<unsigned>(K) >= ExtractMask.size())
+ break;
if (ExtractMask[K] == PoisonMaskElem)
continue;
if (!TE.ReuseShuffleIndices.empty())
@@ -5669,7 +5667,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
}
if (!Entries.empty())
TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
- if (!GatherShuffles[I])
+ if (I >= GatherShuffles.size() || !GatherShuffles[I])
return 0U;
return std::max(Entries[I].front()->getVectorFactor(),
Entries[I].back()->getVectorFactor());
@@ -6381,12 +6379,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
if (!TryRecursiveCheck || VL.size() < ListLimit)
return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
- // FIXME: The following code has not been updated for non-power-of-2
- // vectors (and not whole registers). The splitting logic here does not
- // cover the original vector if the vector factor is not a power of two.
- if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
- return false;
-
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
unsigned MinVF = getMinVF(2 * Sz);
DemandedElts.clearAllBits();
@@ -6397,8 +6389,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
VF >= MinVF;
VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
SmallVector<LoadsState> States;
- for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, std::min(VF, End - Cnt));
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
LoadsState LS =
@@ -6410,7 +6402,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
DemandedElts.setAllBits();
break;
}
- DemandedElts.setBits(Cnt, Cnt + VF);
+ DemandedElts.setBits(Cnt, Cnt + Slice.size());
continue;
}
// If need the reorder - consider as high-cost masked gather for now.
@@ -6436,13 +6428,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
VecLdCost +=
TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
}
- auto *SubVecTy = getWidenedType(ScalarTy, VF);
for (auto [I, LS] : enumerate(States)) {
+ const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - I * VF);
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[I * VF]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
? 0
- : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+ : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, SliceVF),
LI0->getPointerOperand(),
Instruction::GetElementPtr, CostKind, ScalarTy,
SubVecTy)
@@ -6456,12 +6449,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
getUnderlyingObject(PointerOps.front());
}))
VectorGEPCost += getScalarizationOverhead(
- TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
+ TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
/*Insert=*/true, /*Extract=*/false, CostKind);
else
VectorGEPCost +=
getScalarizationOverhead(
- TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
+ TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
/*Insert=*/true, /*Extract=*/false, CostKind) +
::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
CostKind);
@@ -6501,7 +6494,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
continue;
}
SmallVector<int> ShuffleMask(VL.size());
- for (int Idx : seq<int>(0, VL.size()))
+ for (int Idx : seq<int>(VL.size()))
ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
if (I > 0)
VecLdCost +=
@@ -6740,10 +6733,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
- // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
- assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
- "Reshuffling scalars not yet supported for nodes with padding");
-
if (isSplat(TE.Scalars))
return std::nullopt;
// Check if reuse shuffle indices can be improved by reordering.
@@ -7082,12 +7071,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
Res == LoadsState::CompressVectorize)
return std::move(CurrentOrder);
}
- // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
- // has been auditted for correctness with non-power-of-two vectors.
- if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
- if (std::optional<OrdersType> CurrentOrder =
- findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
- return CurrentOrder;
+ if (std::optional<OrdersType> CurrentOrder =
+ findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
+ return CurrentOrder;
}
return std::nullopt;
}
@@ -7338,7 +7324,7 @@ void BoUpSLP::reorderTopToBottom() {
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor();
- !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
+ !VFToOrderedEntries.empty() && VF > 1; --VF) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
continue;
@@ -8530,17 +8516,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
AllowToVectorize = CheckIfAllowed(Slice);
} else {
AllowToVectorize =
- (NumElts >= 3 ||
- any_of(ValueToGatherNodes.at(Slice.front()),
- [=](const TreeEntry *TE) {
- return TE->Scalars.size() == 2 &&
- ((TE->Scalars.front() == Slice.front() &&
- TE->Scalars.back() == Slice.back()) ||
- (TE->Scalars.front() == Slice.back() &&
- TE->Scalars.back() == Slice.front()));
- })) &&
- hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
- Slice.size());
+ NumElts >= 3 ||
+ any_of(ValueToGatherNodes.at(Slice.front()),
+ [=](const TreeEntry *TE) {
+ return TE->Scalars.size() == 2 &&
+ ((TE->Scalars.front() == Slice.front() &&
+ TE->Scalars.back() == Slice.back()) ||
+ (TE->Scalars.front() == Slice.back() &&
+ TE->Scalars.back() == Slice.front()));
+ });
}
if (AllowToVectorize) {
SmallVector<Value *> PointerOps;
@@ -9194,10 +9178,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
[[fallthrough]];
case Instruction::ExtractValue: {
bool Reuse = canReuseExtract(VL, CurrentOrder);
- // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
- // non-full registers).
- if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
- return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -9705,7 +9685,7 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
const TargetLibraryInfo &TLI,
const InstructionsState &S,
const BoUpSLP::EdgeInfo &UserTreeIdx,
- bool TryPad = false) {
+ const BoUpSLP &R, bool BuildGatherOnly = true) {
// Check that every instruction appears once in this bundle.
SmallVector<Value *> UniqueValues;
SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
@@ -9726,66 +9706,151 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
// Easy case: VL has unique values and a "natural" size
size_t NumUniqueScalarValues = UniqueValues.size();
- bool IsFullVectors = hasFullVectorsOrPowerOf2(
- TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
- if (NumUniqueScalarValues == VL.size() &&
- (VectorizeNonPowerOf2 || IsFullVectors)) {
+ if (NumUniqueScalarValues == VL.size()) {
ReuseShuffleIndices.clear();
return true;
}
+ bool AreAllValuesNonConst = UniquePositions.size() == NumUniqueScalarValues;
+
+ // Check if we need to schedule the scalars. If no, can keep original scalars
+ // and avoid extra shuffles.
+ bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
+ !isVectorLikeInstWithConstOps(S.getMainOp()) &&
+ (S.areInstructionsWithCopyableElements() ||
+ !doesNotNeedToSchedule(UniqueValues));
+ // Drop tail poisons, if the values can be vectorized.
+ if (RequireScheduling) {
+ const auto EndIt =
+ find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
+ IsaPred<PoisonValue>);
+ assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
+ UniqueValues.erase(EndIt.base(), UniqueValues.end());
+ NumUniqueScalarValues = UniqueValues.size();
+ }
+
+ // Checks if unique inserts + shuffle is more profitable than just inserts or
+ // vectorized values.
+ auto EstimatePackPlusShuffleVsInserts = [&]() {
+ // Single instruction/argument insert - no shuffle.
+ if (UniquePositions.size() == 1 &&
+ (NumUniqueScalarValues == 1 ||
+ all_of(UniqueValues, IsaPred<UndefValue, Instruction, Argument>)))
+ return std::make_pair(false, false);
+ // Check if the given list of loads can be effectively vectorized.
+ auto CheckLoads = [&](ArrayRef<Value *> VL, bool IncludeGather) {
+ assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
+ BoUpSLP::OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ // Modified loads are gathered - use the original loads, result is the
+ // same, but cheaper, no shuffle.
+ BoUpSLP::LoadsState Res =
+ R.canVectorizeLoads(VL, S.getMainOp(), Order, PointerOps);
+ return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
+ Res == BoUpSLP::LoadsState::ScatterVectorize;
+ };
+ // If the scalars are the operands of the root node - try to vectorize them
+ // with shuffles, otherwise we end up with the gather node, which may be
+ // non-profitable/small-tree for the vectorization.
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 &&
+ !BuildGatherOnly) {
+ if (S && S.getOpcode() == Instruction::Load) {
+ // Modified loads are gathered - use the original loads, result is the
+ // same, but cheaper, no shuffle.
+ return std::make_pair(
+ true, CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
+ CheckLoads(VL, /*IncludeGather=*/false));
+ }
+ return std::make_pair(true, !RequireScheduling);
+ }
+ // Mark unique scalars, to be gathered/buildvectorized.
+ APInt DemandedElts = APInt::getZero(VL.size());
+ for_each(enumerate(ReuseShuffleIndices), [&](const auto &P) {
+ // Do not include constants.
+ if (P.value() != PoisonMaskElem &&
+ UniquePositions.contains(UniqueValues[P.value()]))
+ DemandedElts.setBit(P.index());
+ });
+ Type *ScalarTy = UniqueValues.front()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, VL.size());
+ auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
+ // No need to schedule scalars and only single register used? Use original
+ // scalars, do not pack.
+ if (!RequireScheduling) {
+ const unsigned NumParts = ::getNumberOfParts(TTI, VecTy);
+ if (VL.size() / NumUniqueScalarValues == 1 &&
+ (NumParts <= 1 || ::getNumberOfParts(TTI, UniquesVecTy) >= NumParts))
+ return std::make_pair(true, true);
+ }
+ // Check if unique loads more profitable than repeated loads.
+ if (S && S.getOpcode() == Instruction::Load) {
+ bool UniquesVectorized =
+ !CheckLoads(UniqueValues, /*IncludeGather=*/true);
+ if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=...
[truncated]
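For intuition about the first hunk above (getNumberOfParts), here is a minimal standalone C++ sketch, not code from the patch. The helpers below are simplified stand-ins for getFullVectorNumberOfElements and hasFullVectorsOrPowerOf2, assuming a register that holds four elements of the scalar type; the real queries go through TTI and are target-dependent. The point is only that padding the element count up to a full-register multiple before the divisibility check lets a non-power-of-2 count such as 6 still be split into parts.

#include <cassert>
#include <cstdio>

// Assumed register width for this sketch: 4 elements of the scalar type.
constexpr unsigned FullRegElems = 4;

static bool isPowerOf2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

// Simplified stand-in for hasFullVectorsOrPowerOf2(): the count is either a
// power of 2 or a whole number of full registers.
static bool hasFullVectorsOrPowerOf2(unsigned Sz) {
  return isPowerOf2(Sz) || (Sz >= FullRegElems && Sz % FullRegElems == 0);
}

// Simplified stand-in for getFullVectorNumberOfElements(): round the count
// up to the next multiple of the register width.
static unsigned getFullVectorNumberOfElements(unsigned Sz) {
  return ((Sz + FullRegElems - 1) / FullRegElems) * FullRegElems;
}

// Old behavior: split the raw element count Sz into NumParts pieces.
static unsigned numPartsOld(unsigned Sz, unsigned NumParts) {
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(Sz / NumParts))
    return 1;
  return NumParts;
}

// New behavior: pad Sz to a full-register multiple (PWSz) before checking
// divisibility and the per-part width.
static unsigned numPartsNew(unsigned Sz, unsigned NumParts) {
  unsigned PWSz = getFullVectorNumberOfElements(Sz);
  if (NumParts >= Sz || PWSz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(PWSz / NumParts))
    return 1;
  return NumParts;
}

int main() {
  // 6 scalars over 2 registers: the old check bails out (6 / 2 == 3 is not a
  // power of 2 and not a full register); the new check pads 6 up to 8, and
  // 8 / 2 == 4 is exactly one full register, so two parts are reported.
  std::printf("old: %u parts, new: %u parts\n", numPartsOld(6, 2),
              numPartsNew(6, 2));
  assert(numPartsOld(6, 2) == 1 && numPartsNew(6, 2) == 2);
  return 0;
}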
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Enables non-power-of-2 vectorization within the SLP tree. The root nodes are still required to be power-of-2; this will be addressed in follow-up patches.

Full version of the patch: https://github.com/llvm/llvm-project/pull/151530.diff
Ping!
Enables non-power-of-2 vectorization within the SLP tree. The root nodes are still required to be power-of-2; this will be addressed in follow-up patches.
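As a rough, hypothetical source-level illustration of the kind of pattern this targets (not a test from the patch, and whether it actually vectorizes still depends on the target and cost model): the root below is a power-of-2 group of four adjacent stores, while one operand group contains only three distinct loads, i.e. a non-power-of-2 node inside the tree.

// Hypothetical example: the four adjacent stores form a power-of-2 root,
// but the loads from `b` contribute only three unique scalars (b[1] is
// repeated), so the corresponding operand node is non-power-of-2.
void sum4(const int *a, const int *b, int *out) {
  out[0] = a[0] + b[0];
  out[1] = a[1] + b[1];
  out[2] = a[2] + b[2];
  out[3] = a[3] + b[1]; // reuse of b[1]: only 3 unique b-loads in this operand
}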