[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #107273

alexey-bataev · 2024-09-04T17:27:58Z

Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.

Created using spr 1.3.5

llvmbot · 2024-09-04T17:28:30Z

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Patch adds basic support for non-power-of-2 number of elements in
operands. The patch still requires that this number addresses whole
registers.

Full diff: https://github.com/llvm/llvm-project/pull/107273.diff

3 Files Affected:

(modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+13-1)
(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+73-36)
(modified) llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll (+18-10)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54a..67ded1e21c4835 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2531,7 +2531,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   unsigned getNumberOfParts(Type *Tp) {
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
-    return LT.first.isValid() ? *LT.first.getValue() : 0;
+    if (!LT.first.isValid())
+      return 0;
+    // Try to find actual number of parts for non-power-of-2 elements as
+    // ceil(num-of-elements/num-of-subtype-elements).
+    if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
+        Tp && LT.second.isFixedLengthVector() &&
+        !has_single_bit(FTp->getNumElements())) {
+      if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
+              EVT(LT.second).getTypeForEVT(Tp->getContext()));
+          SubTp && SubTp->getElementType() == FTp->getElementType())
+        return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
+    }
+    return *LT.first.getValue();
   }
 
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 60476398e5ca75..3c647b36e98490 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
                               VF * getNumElements(ScalarTy));
 }
 
+/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
+/// which forms type, which splits by \p TTI into whole vector types during
+/// legalization.
+static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
+                                              Type *Ty, unsigned Sz) {
+  if (!isValidElementType(Ty))
+    return bit_ceil(Sz);
+  // Find the number of elements, which forms full vectors.
+  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+  if (NumParts == 0 || NumParts >= Sz)
+    return bit_ceil(Sz);
+  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
+}
+
 static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                    SmallVectorImpl<int> &Mask) {
   // The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -394,7 +408,7 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
 /// total number of elements \p Size and number of registers (parts) \p
 /// NumParts.
 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
-  return PowerOf2Ceil(divideCeil(Size, NumParts));
+  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
 }
 
 /// Returns correct remaining number of elements, considering total amount \p
@@ -1223,6 +1237,22 @@ static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
          (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
 }
 
+/// Returns true if widened type of \p Ty elements with size \p Sz represents
+/// full vector type, i.e. adding extra element results in extra parts upon type
+/// legalization.
+static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty,
+                               unsigned Sz) {
+  if (Sz <= 1)
+    return false;
+  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+    return false;
+  if (has_single_bit(Sz))
+    return true;
+  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
+  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
+         Sz % NumParts == 0;
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -2466,7 +2496,9 @@ class BoUpSLP {
         }
         // TODO: Check if we can remove a check for non-power-2 number of
         // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+        return UniqueValues.size() != 2 &&
+               hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),
+                                  UniqueValues.size());
       };
 
       // If the initial strategy fails for any of the operand indexes, then we
@@ -3275,8 +3307,9 @@ class BoUpSLP {
                           SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
     /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {
-      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
+    bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const {
+      bool IsNonPowerOf2 = !hasFullVectorsOnly(
+          TTI, getValueType(Scalars.front()), Scalars.size());
       assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
              "Reshuffling not supported with non-power-of-2 vectors yet.");
       return IsNonPowerOf2;
@@ -3454,7 +3487,7 @@ class BoUpSLP {
 
     if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+      assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) &&
              "Reordering isn't implemented for non-power-of-2 nodes yet");
     }
     return Last;
@@ -4732,7 +4765,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (!Order.empty() && !has_single_bit(VL.size())) {
+  if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) {
     assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                    "supported with VectorizeNonPowerOf2");
     return LoadsState::Gather;
@@ -4786,12 +4819,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
                  });
         });
     const unsigned AbsoluteDiff = std::abs(*Diff);
-    if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
-                              ((Sz > MinProfitableStridedLoads ||
-                                (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                                 has_single_bit(AbsoluteDiff))) &&
-                               AbsoluteDiff > Sz) ||
-                              *Diff == -(static_cast<int>(Sz) - 1))) {
+    if (IsPossibleStrided &&
+        (IsAnyPointerUsedOutGraph ||
+         ((Sz > MinProfitableStridedLoads ||
+           (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+            hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&
+          AbsoluteDiff > Sz) ||
+         *Diff == -(static_cast<int>(Sz) - 1))) {
       int Stride = *Diff / static_cast<int>(Sz - 1);
       if (*Diff == Stride * static_cast<int>(Sz - 1)) {
         Align Alignment =
@@ -5196,7 +5230,7 @@ static bool areTwoInsertFromSameBuildVector(
 std::optional<BoUpSLP::OrdersType>
 BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-  if (TE.isNonPowOf2Vec())
+  if (TE.isNonPowOf2Vec(*TTI))
     return std::nullopt;
 
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -5580,7 +5614,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF /= 2) {
+       VF -= 2) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -5753,7 +5787,7 @@ bool BoUpSLP::canReorderOperands(
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
   // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-  if (UserTE->isNonPowOf2Vec())
+  if (UserTE->isNonPowOf2Vec(*TTI))
     return false;
 
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
@@ -5928,7 +5962,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
         const auto AllowsReordering = [&](const TreeEntry *TE) {
           // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
-          if (TE->isNonPowOf2Vec())
+          if (TE->isNonPowOf2Vec(*TTI))
             return false;
           if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
               (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
@@ -6581,7 +6615,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -6987,24 +7021,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         UniqueValues.emplace_back(V);
     }
     size_t NumUniqueScalarValues = UniqueValues.size();
-    if (NumUniqueScalarValues == VL.size()) {
+    bool IsFullVectors =
+        hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(),
+                           NumUniqueScalarValues);
+    if (NumUniqueScalarValues == VL.size() &&
+        (VectorizeNonPowerOf2 || IsFullVectors)) {
       ReuseShuffleIndices.clear();
     } else {
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
       }
       LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-      if (NumUniqueScalarValues <= 1 ||
-          (UniquePositions.size() == 1 && all_of(UniqueValues,
-                                                 [](Value *V) {
-                                                   return isa<UndefValue>(V) ||
-                                                          !isConstant(V);
-                                                 })) ||
-          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
+          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
+             return isa<UndefValue>(V) || !isConstant(V);
+           }))) {
         if (DoNotFail && UniquePositions.size() > 1 &&
             NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
             all_of(UniqueValues, [=](Value *V) {
@@ -7012,7 +7047,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                      areAllUsersVectorized(cast<Instruction>(V),
                                            UserIgnoreList);
             })) {
-          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          // Find the number of elements, which forms full vectors.
+          unsigned PWSz = getFullVectorNumberOfElements(
+              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
           if (PWSz == VL.size()) {
             ReuseShuffleIndices.clear();
           } else {
@@ -9107,9 +9144,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       return nullptr;
     Value *VecBase = nullptr;
     ArrayRef<Value *> VL = E->Scalars;
-    // If the resulting type is scalarized, do not adjust the cost.
-    if (NumParts == VL.size())
-      return nullptr;
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
     bool PrevNodeFound = any_of(
@@ -9762,7 +9796,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       InsertMask[Idx] = I + 1;
     }
     unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
-    if (NumOfParts > 0)
+    if (NumOfParts > 0 && NumOfParts < NumElts)
       VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
     unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                      VecScalarsSz;
@@ -10991,7 +11025,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           // Keep original scalar if number of externally used instructions in
           // the same entry is not power of 2. It may help to do some extra
           // vectorization for now.
-          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
+          KeepScalar =
+              ScalarUsesCount <= 1 ||
+              !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount);
         }
         if (KeepScalar) {
           ExternalUsesAsOriginalScalar.insert(EU.Scalar);
@@ -11688,13 +11724,14 @@ BoUpSLP::isGatherShuffledEntry(
   if (TE == VectorizableTree.front().get())
     return {};
   // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  if (TE->isNonPowOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  // Number of scalars must be divisible by NumParts.
+  if (VL.size() % NumParts != 0)
+    return {};
   unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17005,7 +17042,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
 
-      if (!has_single_bit(ActualVF))
+      if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF))
         continue;
 
       if (MaxVFOnly && ActualVF < MaxVF)
diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
index 281b5f99540eab..4074b8654362e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
@@ -1,21 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux -mattr=+v -slp-threshold=-100 | FileCheck %s --check-prefix=RISCV
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-100 | FileCheck %s
 ; REQUIRES: aarch64-registered-target, x86-registered-target, riscv-registered-target
 
 define i64 @test(ptr %p) {
+; RISCV-LABEL: @test(
+; RISCV-NEXT:  entry:
+; RISCV-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
+; RISCV-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
+; RISCV-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
+; RISCV-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
+; RISCV-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
+; RISCV-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
+; RISCV-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; RISCV-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; RISCV-NEXT:    ret i64 [[TMP6]]
+;
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
-; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], <i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42, i64 42>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1

github-actions · 2024-09-04T17:32:43Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5

preames · 2024-09-05T14:44:44Z

Can you explain how this version of the patch differs (profitability heuristic wise) from the previous?

My style comment from the previous review (#106449) applies here as well.

Created using spr 1.3.5

preames · 2024-09-06T19:32:15Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+        (IsAnyPointerUsedOutGraph ||
+         ((Sz > MinProfitableStridedLoads ||
+           (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+            hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) &&


I think you're confusing two things here. The has_single_bit check was on the size (and thus stride), not on the element type. I think we can probably just relax this check to allow any stride >= Sz.

Mistakenly replaced while searching for uses of the has_single_bit function.

preames · 2024-09-06T19:33:30Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
-       VF = bit_ceil(VF) / 2) {
+       VF = (VF & ~1U) - 2) {


What is this trying to do? & -1 should just be VF shouldn't it?

If VF is odd (e.g. 3 or 7 elements), we want to try the next lower even value. I reworked it to VF -= 2 - (VF & 1) to jump over odd VFs.

preames · 2024-09-06T19:37:10Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  // Number of scalars must be divisible by NumParts.
+  if (VL.size() % NumParts != 0)


I'm reading over the existing NumParts code, and not sure I'm grasping what it's doing. Is the idea to cost manual splitting of the vector to a smaller legal vector type? If so, why is this not something that TTI should be doing internally?

No, not vector type splitting, but finding subvectors, which can be shuffled

Created using spr 1.3.5

alexey-bataev · 2024-09-12T17:33:16Z

Ping!

preames · 2024-09-13T17:07:57Z

Ping!

My previous question still stands. Can you explain how this version of the patch differs (profitability heuristic wise) from the previous? I'm not asking to be difficult. I'm asking because it's hard to review the change without knowing what your intention and reasoning is.

alexey-bataev · 2024-09-13T17:18:42Z

Ping!

My previous question still stands. Can you explain how this version of the patch differs (profitability heuristic wise) from the previous? I'm not asking to be difficult. I'm asking because it's hard to review the change without knowing what your intention and reasoning is.

It does not add any profitability checks. It just checks for the actual number of parts (registers), used in the codegen for the type, and tries to operate on the whole parts only.

I'm going to make this mode (whole number of registers, but number of elements is non-power-of-2) default for the key instructions, when trying to build/analyze the graph.
When you have a non-power-of-2 number of key nodes (say, 7), the compiler first tries to vectorize all 7 elements. If that fails, it falls back to a power-of-2 number of elements, i.e. 4. Instead, we may try to vectorize 6 elements if they operate on 3 full registers. In some cases this might be more profitable than 4 elements (because of the cross-use between registers).

preames · 2024-09-13T22:09:59Z

I think I understand, let me state this back to you to make sure.

You're extending the non-power-of-two vectorization support with a different class of non-power-of-two vectors which may be profitable. (The previous class was VL=2^N-1.) The class being proposed is VL=VLMAX*N where VLMAX is the number of elements in a single vector register. You're planning to both allow this class as the initial VL, and also adjust the search order to try VLMAX boundaries instead of reducing by powers of two.

This sounds reasonable. Let me go look at the code with this in mind.

preames

(Meta comment)

You have several cases where you are relaxing a power-of-two guard for code only given the new VL=N*VLMAX class where I struggle to believe that if that code is correct, it's not also correct for VL=2^N-1. I strongly request that if you believe a piece of code is correct for a non-power-of-two vector, that you remove the guard in a separate commit which adds a test for that particular path.

Once you've done so, this patch becomes fairly trivial as it's just a new "type" of non-power-of-twos which can flow through.

preames · 2024-09-13T22:14:00Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+    return false;
+  if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
+    return false;
+  if (has_single_bit(Sz))


I don't think there's any requirement that a power of two VL is a full register? Consider SEW=64, VL=2 on RISCV. With zvl256b this is at most half of a register. At zvl128b, it might be a full register (or might not.)

Because SLP currently supports power-of-2 numbers of elements, no matter whether they use full registers or not. This check just reflects what we currently have in the compiler by default; it is not a new requirement. I'll rename the function.

preames · 2024-09-13T22:16:22Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

        // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+        return UniqueValues.size() != 2 &&
+               hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(),


Instead of white listing only the new class here, can we remove the restriction in a separate patch? This should be fully VL independent right?

Currently no, it would regress some code. I'll restore the original check here and update it later.

preames · 2024-09-13T22:18:30Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
-    bool isNonPowOf2Vec() const {


Request: Please leave the original method, and add a new one called isAllowedNonPowOfTwo.

preames · 2024-09-13T22:19:32Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
-    assert((has_single_bit(VL.size()) || ReuseShuffleIndices.empty()) &&
+    assert((hasFullVectorsOnly(*TTI, getValueType(VL.front()), VL.size()) ||


Have you audited the reordering code for VL=N*VLMAX? I suspect not as it's not trivial, and I'd expect it to require changes. If you have, we should be able to just remove this assert instead as it should work for any VL.

This is not reordering, this is reusing of the same scalars. The test checks it (see [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>)

preames · 2024-09-13T22:19:48Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
-    assert(!TE.isNonPowOf2Vec() &&
+    assert(!TE.hasNonWholeRegisterElems(*TTI) &&


Same as above.

Again, the test checks this

preames · 2024-09-13T22:20:18Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been auditted for correctness with non-power-of-two vectors.
-    if (!TE.isNonPowOf2Vec())
+    if (!TE.hasNonWholeRegisterElems(*TTI))


If you've audited this for arbitrary VL, please just remove the check. (In its own commit.)

Reordering requires some extra testing, reverting it for now

preames · 2024-09-13T22:21:14Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size()))


Again, have you audited the exposed code?

Reverting it for now

alexey-bataev · 2024-09-13T22:37:20Z

(Meta comment)

You have several cases where you are relaxing a power-of-two guard for code only given the new VL=N*VLMAX class where I struggle to believe that if that code is correct, it's not also correct for VL=2^N-1. I strongly request that if you believe a piece of code is correct for a non-power-of-two vector, that you remove the guard in a separate commit which adds a test for that particular path.

Once you've done so, this patch becomes fairly trivial as it's just a new "type" of non-power-of-twos which can flow through.

I'll drop for now most of non-tested stuff, will include just some basic support.

alexey-bataev · 2024-09-13T22:44:59Z

I think I understand, let me state this back to you to make sure.

You're extending the non-power-of-two vectorization support with a different class of non-power-of-two vectors which may be profitable. (The previous class was VL=2^N-1.) The class being proposed is VL=VLMAX*N where VLMAX is the number of elements in a single vector register. You're planning to both allow this class as the initial VL, and also adjust the search order to try VLMAX boundaries instead of reducing by powers of two.

This sounds reasonable. Let me go look at the code with this in mind.

Yes. I plan to use this class mostly for the initial VL only (to balance compile time and vectorization quality); for other scalars in the graph I plan to allow any non-power-of-2 value.

Created using spr 1.3.5

preames · 2024-09-17T18:36:42Z

For the record, LGTM because I don't spot a functional problem with the code, not because I'm convinced this is always going to be profitable w/o cost model updates.

alexey-bataev · 2024-09-17T18:39:04Z

For the record, LGTM because I don't spot a functional problem with the code, not because I'm convinced this is always going to be profitable w/o cost model updates.

It is a conservative solution. Relies completely on the current cost model, which always rounds the number of elements up to power-of-2

Created using spr 1.3.5

nikic · 2024-09-25T20:06:40Z

This causes a crash when linking lencod from llvm-test-suite in ReleaseThinLTO configuration. Reverted for now in 29b92d0.

…mber of elements in operands. Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. Reviewers: RKSimon, preames Reviewed By: preames Pull Request: #107273

mstorsjo · 2024-09-26T07:36:36Z

This still triggers failed asserts for me:

typedef struct {
  int *a[][4]
} b;
void *c;
int d;
void e() {
  b *f = c;
  int g, h = d;
  g = 0;
  for (; g < h; g++)
    f->a[1][2][g] = f->a[1][3][g] = f->a[2][0][g] = f->a[2][1][g] =
        f->a[2][2][g] = f->a[2][3][g] = f->a[3][0][g] = f->a[3][1][g] =
            f->a[3][2][g] = 0;
}

$ clang -target x86_64-linux-gnu -c -O2 repro.c 
clang: /home/martin/code/llvm-project/llvm/include/llvm/ADT/SmallVector.h:291: T& llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::operator[](llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type) [with T = unsigned int; <template-parameter-1-2> = void; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::reference = unsigned int&; llvm::SmallVectorTemplateCommon<T, <template-parameter-1-2> >::size_type = long unsigned int]: Assertion `idx < size()' failed.

alexey-bataev · 2024-09-26T11:53:33Z

typedef struct {
  int *a[][4]
} b;
void *c;
int d;
void e() {
  b *f = c;
  int g, h = d;
  g = 0;
  for (; g < h; g++)
    f->a[1][2][g] = f->a[1][3][g] = f->a[2][0][g] = f->a[2][1][g] =
        f->a[2][2][g] = f->a[2][3][g] = f->a[3][0][g] = f->a[3][1][g] =
            f->a[3][2][g] = 0;
}

Fixed in 100fd0c

mstorsjo · 2024-09-26T13:02:34Z

Thanks! It seems like my builds pass now after that fix.

fhahn · 2024-09-26T18:14:09Z

Looks like there may be another crash: https://llvm.godbolt.org/z/azGPd145f

alexey-bataev · 2024-09-26T20:20:47Z

Looks like there may be another crash: https://llvm.godbolt.org/z/azGPd145f

Must be fixed in be6aed9

…mber of elements in operands. Patch adds basic support for non-power-of-2 number of elements in operands. The patch still requires that this number addresses whole registers. Reviewers: RKSimon, preames Reviewed By: preames Pull Request: llvm#107273

fhahn · 2024-09-28T20:25:09Z

Looks like there may be another crash: https://llvm.godbolt.org/z/azGPd145f

Must be fixed in be6aed9

Yes thanks!

[𝘀𝗽𝗿] initial version

f927502

Created using spr 1.3.5

llvmbot added vectorizers llvm:transforms labels Sep 4, 2024

alexey-bataev requested a review from RKSimon September 4, 2024 17:28

Fix formatting

a79159b

Created using spr 1.3.5

RKSimon requested a review from preames September 5, 2024 09:38

Rebase, address comments

0a78a26

Created using spr 1.3.5

preames reviewed Sep 6, 2024

View reviewed changes

alexey-bataev added 2 commits September 6, 2024 20:42

Rebase, address comments

b8b1527

Created using spr 1.3.5

Rebase, restored some code, some optimizations

7c68dbf

Created using spr 1.3.5

preames requested changes Sep 13, 2024

View reviewed changes

Rebase, address comments

affdc53

Created using spr 1.3.5

preames approved these changes Sep 17, 2024

View reviewed changes

Rebase

13c2844

Created using spr 1.3.5

alexey-bataev merged commit 6b109a3 into main Sep 25, 2024
3 checks passed

alexey-bataev deleted the users/alexey-bataev/spr/slpinitial-support-for-non-power-of-2-but-still-whole-register-number-of-elements-in-operands-1 branch September 25, 2024 14:43

[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #107273

[SLP]Initial support for non-power-of-2 (but still whole register) number of elements in operands. #107273

Uh oh!

Conversation

alexey-bataev commented Sep 4, 2024

Uh oh!

llvmbot commented Sep 4, 2024

Uh oh!

github-actions bot commented Sep 4, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

preames commented Sep 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

alexey-bataev commented Sep 12, 2024

Uh oh!

preames commented Sep 13, 2024

Uh oh!

alexey-bataev commented Sep 13, 2024

Uh oh!

preames commented Sep 13, 2024

Uh oh!

preames left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

alexey-bataev Sep 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

alexey-bataev commented Sep 13, 2024

Uh oh!

alexey-bataev commented Sep 13, 2024

Uh oh!

preames commented Sep 17, 2024

Uh oh!

alexey-bataev commented Sep 17, 2024

Uh oh!

Uh oh!

nikic commented Sep 25, 2024

Uh oh!

mstorsjo commented Sep 26, 2024

Uh oh!

alexey-bataev commented Sep 26, 2024

Uh oh!

github-actions bot commented Sep 4, 2024 •

edited

Loading

preames commented Sep 5, 2024 •

edited

Loading

alexey-bataev Sep 16, 2024 •

edited

Loading