-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #107273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Created using spr 1.3.5
|
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. Full diff: https://github.com/llvm/llvm-project/pull/107273.diff 3 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54a..67ded1e21c4835 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2531,7 +2531,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
- return LT.first.isValid() ? *LT.first.getValue() : 0;
+ if (!LT.first.isValid())
+ return 0;
+ // Try to find actual number of parts for non-power-of-2 elements as
+ // ceil(num-of-elements/num-of-subtype-elements).
+ if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+ Tp && LT.second.isFixedLengthVector() &&
+ !has_single_bit(FTp->getNumElements())) {
+ if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
+ EVT(LT.second).getTypeForEVT(Tp->getContext()));
+ SubTp && SubTp->getElementType() == FTp->getElementType())
+ return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
+ }
+ return *LT.first.getValue();
}
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 60476398e5ca75..3c647b36e98490 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
VF * getNumElements(ScalarTy));
}
+/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
+/// which forms type, which splits by \p TTI into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+ Type *Ty, unsigned Sz) {
+ if (!isValidElementType(Ty))
+ return bit_ceil(Sz);
+ // Find the number of elements, which forms full vectors.
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+ if (NumParts == 0 || NumParts >= Sz)
+ return bit_ceil(Sz);
+ return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
SmallVectorImpl<int> &Mask) {
// The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
/// total number of elements \p Size and number of registers (parts) \p
/// NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
- return PowerOf2Ceil(divideCeil(Size, NumParts));
+ return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}
/// Returns correct remaining number of elements, considering total amount \p
@@ -1223,6 +1237,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
+/// Returns true if widened type of \p Ty elements with size \p Sz represents
+/// full vector type, i.e. adding extra element results in extra parts upon type
+/// legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+ unsigned Sz) {
+ if (Sz <= 1)
+ return false;
+ if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+ return false;
+ if (has_single_bit(Sz))
+ return true;
+ const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+ return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
+ Sz % NumParts == 0;
+}
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -2466,7 +2496,9 @@ class BoUpSLP {
}
// TODO: Check if we can remove a check for non-power-2 number of
// scalars after full support of non-power-2 vectorization.
- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+ return UniqueValues.size() != 2 &&
+ hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+ UniqueValues.size());
};
// If the initial strategy fails for any of the operand indexes, then we
@@ -3275,8 +3307,9 @@ class BoUpSLP {
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
/// Return true if this is a non-power-of-2 node.
- bool isNonPowOf2Vec() const {
- bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+ bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+ bool IsNonPowerOf2 = !hasFullVectorsOnly(
+ TTI, getValueType(Scalars.front()), Scalars.size());
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
"Reshuffling not supported with non-power-of-2 vectors yet.");
return IsNonPowerOf2;
@@ -3454,7 +3487,7 @@ class BoUpSLP {
if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);
- assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+ assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
"Reordering isn't implemented for non-power-of-2 nodes yet");
}
return Last;
@@ -4732,7 +4765,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (!Order.empty() && !has_single_bit(VL.size())) {
+ if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
"supported with VectorizeNonPowerOf2");
return LoadsState::Gather;
@@ -4786,12 +4819,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
});
});
const unsigned AbsoluteDiff = std::abs(*Diff);
- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
- ((Sz > MinProfitableStridedLoads ||
- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
- has_single_bit(AbsoluteDiff))) &&
- AbsoluteDiff > Sz) ||
- *Diff == -(static_cast<int>(Sz) - 1))) {
+ if (IsPossibleStrided &&
+ (IsAnyPointerUsedOutGraph ||
+ ((Sz > MinProfitableStridedLoads ||
+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+ hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
+ AbsoluteDiff > Sz) ||
+ *Diff == -(static_cast<int>(Sz) - 1))) {
int Stride = *Diff / static_cast<int>(Sz - 1);
if (*Diff == Stride * static_cast<int>(Sz - 1)) {
Align Alignment =
@@ -5196,7 +5230,7 @@ static bool areTwoInsertFromSameBuildVector(
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
- if (TE.isNonPowOf2Vec())
+ if (TE.isNonPowOf2Vec(*TTI))
return std::nullopt;
// No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5580,7 +5614,7 @@ void BoUpSLP::reorderTopToBottom() {
// Reorder the graph nodes according to their vectorization factor.
for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
- VF /= 2) {
+ VF -= 2) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
continue;
@@ -5753,7 +5787,7 @@ bool BoUpSLP::canReorderOperands(
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (UserTE->isNonPowOf2Vec())
+ if (UserTE->isNonPowOf2Vec(*TTI))
return false;
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5928,7 +5962,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto AllowsReordering = [&](const TreeEntry *TE) {
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
- if (TE->isNonPowOf2Vec())
+ if (TE->isNonPowOf2Vec(*TTI))
return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6581,7 +6615,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
- if (!has_single_bit(VL.size()))
+ if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
return TreeEntry::NeedToGather;
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
@@ -6987,24 +7021,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
UniqueValues.emplace_back(V);
}
size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
+ bool IsFullVectors =
+ hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+ NumUniqueScalarValues);
+ if (NumUniqueScalarValues == VL.size() &&
+ (VectorizeNonPowerOf2 || IsFullVectors)) {
ReuseShuffleIndices.clear();
} else {
// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
- if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
"for nodes with padding.\n");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
}
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- (UniquePositions.size() == 1 && all_of(UniqueValues,
- [](Value *V) {
- return isa<UndefValue>(V) ||
- !isConstant(V);
- })) ||
- !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+ if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
+ (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
+ return isa<UndefValue>(V) || !isConstant(V);
+ }))) {
if (DoNotFail && UniquePositions.size() > 1 &&
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
all_of(UniqueValues, [=](Value *V) {
@@ -7012,7 +7047,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
areAllUsersVectorized(cast<Instruction>(V),
UserIgnoreList);
})) {
- unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+ // Find the number of elements, which forms full vectors.
+ unsigned PWSz = getFullVectorNumberOfElements(
+ *TTI, UniqueValues.front()->getType(), UniqueValues.size());
if (PWSz == VL.size()) {
ReuseShuffleIndices.clear();
} else {
@@ -9107,9 +9144,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return nullptr;
Value *VecBase = nullptr;
ArrayRef<Value *> VL = E->Scalars;
- // If the resulting type is scalarized, do not adjust the cost.
- if (NumParts == VL.size())
- return nullptr;
// Check if it can be considered reused if same extractelements were
// vectorized already.
bool PrevNodeFound = any_of(
@@ -9762,7 +9796,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InsertMask[Idx] = I + 1;
}
unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
- if (NumOfParts > 0)
+ if (NumOfParts > 0 && NumOfParts < NumElts)
VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
VecScalarsSz;
@@ -10991,7 +11025,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Keep original scalar if number of externally used instructions in
// the same entry is not power of 2. It may help to do some extra
// vectorization for now.
- KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+ KeepScalar =
+ ScalarUsesCount <= 1 ||
+ !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
}
if (KeepScalar) {
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11688,13 +11724,14 @@ BoUpSLP::isGatherShuffledEntry(
if (TE == VectorizableTree.front().get())
return {};
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
- if (TE->isNonPowOf2Vec())
+ if (TE->isNonPowOf2Vec(*TTI))
return {};
Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
- assert(VL.size() % NumParts == 0 &&
- "Number of scalars must be divisible by NumParts.");
+ // Number of scalars must be divisible by NumParts.
+ if (VL.size() % NumParts != 0)
+ return {};
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17005,7 +17042,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned ActualVF = std::min(MaxInst - I, VF);
- if (!has_single_bit(ActualVF))
+ if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
continue;
if (MaxVFOnly && ActualVF < MaxVF)
diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
index 281b5f99540eab..4074b8654362e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
@@ -1,21 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s --check-prefix=RISCV
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s
; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target
define i64 @test(ptr %p) {
+; RISCV-LABEL: @test(
+; RISCV-NEXT: entry:
+; RISCV-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; RISCV-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; RISCV-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; RISCV-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
+; RISCV-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; RISCV-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; RISCV-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; RISCV-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; RISCV-NEXT: ret i64 [[TMP6]]
+;
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT: ret i64 [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT: ret i64 [[TMP3]]
;
entry:
%arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
|
Can you explain how this version of the patch differs (profitability heuristic wise) from the previous? My style comment from the previous review (#106449) applies here as well. |
Created using spr 1.3.5
| (IsAnyPointerUsedOutGraph || | ||
| ((Sz > MinProfitableStridedLoads || | ||
| (AbsoluteDiff <= MaxProfitableLoadStride * Sz && | ||
| hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you're confusing two things here. The has_single_bit check was on the size (and thus stride), not on the element type. I think we can probably just relax this check to allow any stride >= Sz.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was replaced by mistake while I was searching for uses of the has_single_bit function.
| // Reorder the graph nodes according to their vectorization factor. | ||
| for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; | ||
| VF = bit_ceil(VF) / 2) { | ||
| VF = (VF & ~1U) - 2) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this trying to do? & -1 should just be VF shouldn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If VF is odd (e.g., 3 or 7 elements), I want to try the next even VF. I reworked it to VF += 2 - (VF & 1) to jump over even VFs.
| assert(VL.size() % NumParts == 0 && | ||
| "Number of scalars must be divisible by NumParts."); | ||
| // Number of scalars must be divisible by NumParts. | ||
| if (VL.size() % NumParts != 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm reading over the existing NumParts code, and not sure I'm grasping what it's doing. Is the idea to cost manual splitting of the vector to a smaller legal vector type? If so, why is this not something that TTI should be doing internally?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, this is not vector type splitting; it is finding subvectors that can be shuffled.
Created using spr 1.3.5
Created using spr 1.3.5
|
Ping! |
My previous question still stands. Can you explain how this version of the patch differs (profitability heuristic wise) from the previous? I'm not asking to be difficult. I'm asking because it's hard to review the change without knowing what your intention and reasoning is. |
It does not add any profitability checks. It just checks the actual number of parts (registers) used by codegen for the type and tries to operate on whole parts only. I'm going to make this mode (a whole number of registers, but a non-power-of-2 number of elements) the default for the key instructions when building/analyzing the graph. |
|
I think I understand, let me state this back to you to make sure. You're extending the non-power-of-two vectorization support with a different class of non-power-of-two vectors which may be profitable. (The previous class was VL=2^N-1.) The class being proposed is VL=VLMAX*N where VLMAX is the number of elements in a single vector register. You're planning to both allow this class as the initial VL, and also adjust the search order to try VLMAX boundaries instead of reducing by powers of two. This sounds reasonable. Let me go look at the code with this in mind. |
preames
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Meta comment)
You have several cases where you are relaxing a power-of-two guard for code only given the new VL=N*VLMAX class where I struggle to believe that if that code is correct, it's not also correct for VL=2^N-1. I strongly request that if you believe a piece of code is correct for a non-power-of-two vector, that you remove the guard in a separate commit which adds a test for that particular path.
Once you've done so, this patch becomes fairly trivial as it's just a new "type" of non-power-of-twos which can flow through.
| return false; | ||
| if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty)) | ||
| return false; | ||
| if (has_single_bit(Sz)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think there's any requirement that a power of two VL is a full register? Consider SEW=64, VL=2 on RISCV. With zvl256b this is at most half of a register. At zvl128b, it might be a full register (or might not.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, SLP supports any power-of-2 number of elements, regardless of whether it uses full registers or not. This check just covers what the compiler already supports by default; it is not a new requirement. I'll rename the function.
| // scalars after full support of non-power-2 vectorization. | ||
| return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size()); | ||
| return UniqueValues.size() != 2 && | ||
| hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of white listing only the new class here, can we remove the restriction in a separate patch? This should be fully VL independent right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently no; it would regress some code. I'll restore the original check here and update it later.
| SmallVectorImpl<Value *> *AltScalars = nullptr) const; | ||
|
|
||
| /// Return true if this is a non-power-of-2 node. | ||
| bool isNonPowOf2Vec() const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Request: Please leave the original method, and add a new one called isAllowedNonPowOfTwo.
| // FIXME: Remove once support for ReuseShuffleIndices has been implemented | ||
| // for non-power-of-two vectors. | ||
| assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) && | ||
| assert((hasFullVectorsOnly(*TTI, getValueType(VL.front()), VL.size()) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have you audited the reordering code for VL=N*VLMAX? I suspect not as it's not trivial, and I'd expect it to require changes. If you have, we should be able to just remove this assert instead as it should work for any VL.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not reordering; it is reuse of the same scalars. The test checks it (see [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>)
| if (!TE.ReuseShuffleIndices.empty()) { | ||
| // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. | ||
| assert(!TE.isNonPowOf2Vec() && | ||
| assert(!TE.hasNonWholeRegisterElems(*TTI) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again, the test checks this
| // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars | ||
| // has been auditted for correctness with non-power-of-two vectors. | ||
| if (!TE.isNonPowOf2Vec()) | ||
| if (!TE.hasNonWholeRegisterElems(*TTI)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you've audited this for arbitrary VL, please just remove the check. (In its own commit.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reordering requires some extra testing, reverting it for now
| bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); | ||
| // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. | ||
| if (!has_single_bit(VL.size())) | ||
| if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Again, have you audited the exposed code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverting it for now
For now, I'll drop most of the untested parts and include just some basic support. |
Yes. I plan to use this class mostly for the initial VL only (to balance compile time and vectorization quality); for other scalars in the graph, I plan to allow any non-power-of-2 value. |
Created using spr 1.3.5
|
For the record, LGTM because I don't spot a functional problem with the code, not because I'm convinced this is always going to be profitable w/o cost model updates. |
It is a conservative solution. It relies completely on the current cost model, which always rounds the number of elements up to a power of 2. |
|
This causes a crash when linking lencod from llvm-test-suite in ReleaseThinLTO configuration. Reverted for now in 29b92d0. |
…mber of elements in operands. Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. Reviewers: RKSimon, preames Reviewed By: preames Pull Request: #107273
|
This still triggers failed asserts for me: typedef struct {
int *a[][4]
} b;
void *c;
int d;
void e() {
b *f = c;
int g, h = d;
g = 0;
for (; g < h; g++)
f->a[1][2][g] = f->a[1][3][g] = f->a[2][0][g] = f->a[2][1][g] =
f->a[2][2][g] = f->a[2][3][g] = f->a[3][0][g] = f->a[3][1][g] =
f->a[3][2][g] = 0;
} |
Fixed in 100fd0c |
|
Thanks! It seems like my builds pass now after that fix. |
|
Looks like there may be another crash: https://llvm.godbolt.org/z/azGPd145f |
Must be fixed in be6aed9 |
…mber of elements in operands. Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. Reviewers: RKSimon, preames Reviewed By: preames Pull Request: llvm#107273
Yes thanks! |
Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.