llvm · alexey-bataev · Jan 21, 2025 · Oct 22, 2024 · Nov 22, 2024 · Dec 24, 2024
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2627,7 +2627,9 @@ class BoUpSLP {
         }
         // TODO: Check if we can remove a check for non-power-2 number of
         // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+        return UniqueValues.size() != 2 &&
+               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
+                                        UniqueValues.size());
       };
 
       // If the initial strategy fails for any of the operand indexes, then we
@@ -5045,12 +5047,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                  });
         });
     const unsigned AbsoluteDiff = std::abs(*Diff);
-    if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
-                              ((Sz > MinProfitableStridedLoads ||
-                                (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                                 has_single_bit(AbsoluteDiff))) &&
-                               AbsoluteDiff > Sz) ||
-                              *Diff == -(static_cast<int>(Sz) - 1))) {
+    if (IsPossibleStrided &&
+        (IsAnyPointerUsedOutGraph ||
+         ((Sz > MinProfitableStridedLoads ||
+           (AbsoluteDiff <= MaxProfitableLoadStride * Sz && AbsoluteDiff > Sz &&
+            AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz))) &&
+          AbsoluteDiff > Sz) ||
+         *Diff == -(static_cast<int>(Sz) - 1))) {
       int Stride = *Diff / static_cast<int>(Sz - 1);
       if (*Diff == Stride * static_cast<int>(Sz - 1)) {
         Align Alignment =
@@ -5136,17 +5139,21 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
 
     // FIXME: The following code has not been updated for non-power-of-2
-    // vectors.  The splitting logic here does not cover the original
-    // vector if the vector factor is not a power of two.  FIXME
-    if (!has_single_bit(VL.size()))
+    // vectors (and not whole registers).  The splitting logic here does not
+    // cover the original vector if the vector factor is not a power of two.
+    // FIXME
+    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
       return false;
 
     unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
     unsigned MinVF = getMinVF(2 * Sz);
     DemandedElts.clearAllBits();
     // Iterate through possible vectorization factors and check if vectorized +
     // shuffles is better than just gather.
-    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
+    for (unsigned VF =
+             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
+         VF >= MinVF;
+         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
       SmallVector<LoadsState> States;
       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7552,8 +7559,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
-    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
+    // non-full registers).
+    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -8009,7 +8017,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
       if ((UserTreeIdx.UserTE &&
            UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
-          !has_single_bit(VL.size())) {
+          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
         LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                              "for nodes with padding.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9759,7 +9767,8 @@ void BoUpSLP::transformNodes() {
             if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
                 (S.getOpcode() == Instruction::Load &&
                  areKnownNonVectorizableLoads(Slice)) ||
-                (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
+                (S.getOpcode() != Instruction::Load &&
+                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
               continue;
             if (VF == 2) {
               // Try to vectorize reduced values or if all users are vectorized.
@@ -13369,8 +13378,9 @@ BoUpSLP::isGatherShuffledEntry(
                  return !TE->isGather();
                })))
     return {};
-  // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
+  // implemented yet.
+  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert((TE->UserTreeIndices.size() == 1 ||
@@ -18953,7 +18963,9 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = R.getMinVF(Sz);
-  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
+  unsigned MaxVF = std::max<unsigned>(
+      getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VL.size()),
+      MinVF);
   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
@@ -18970,7 +18982,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   Type *ScalarTy = getValueType(VL[0]);
 
   unsigned NextInst = 0, MaxInst = VL.size();
-  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+       VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
@@ -18985,7 +18998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
       if (MaxVFOnly && ActualVF < MaxVF)
         break;
-      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
+      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
         break;
 
       SmallVector<Value *> Ops(ActualVF, nullptr);

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -1,19 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
+; FIXME: need to fix/improve cost estimation of many SK_PermuteTwo vector registers to account
+; correctly for poisoned/identity shuffles and only calculate actual shuffles.
 define void @test(ptr noalias %0, ptr noalias %1) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
+; CHECK-NEXT:    [[TMP8:%.*]] = load double, ptr [[TMP7]], align 8
+; CHECK-NEXT:    store double [[TMP8]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48
+; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr [[TMP9]], align 16
+; CHECK-NEXT:    store double [[TMP10]], ptr [[TMP6]], align 16
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 5, i32 1>
-; CHECK-NEXT:    store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[TMP11]], align 8
+; CHECK-NEXT:    store double [[TMP12]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32
+; CHECK-NEXT:    store double [[TMP14]], ptr [[TMP15]], align 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP0]], i64 56
+; CHECK-NEXT:    [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40
+; CHECK-NEXT:    store double [[TMP17]], ptr [[TMP18]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT:    [[TMP20:%.*]] = load double, ptr [[TMP19]], align 16
+; CHECK-NEXT:    store double [[TMP20]], ptr [[TMP4]], align 16
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP1]], i64 56

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
@@ -8,23 +8,23 @@ define void @test(i32 %arg) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
 ; CHECK-NEXT:    [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0>
 ; CHECK-NEXT:    [[ADD17:%.*]] = add i32 [[PHI]], 0
-; CHECK-NEXT:    [[ADD18:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[PHI]], 0
 ; CHECK-NEXT:    [[ADD19:%.*]] = add i32 [[PHI2]], 0
-; CHECK-NEXT:    [[ADD23:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[PHI]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4]] = add <2 x i32> [[TMP0]], <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD18]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i32 [[ADD17]], [[ADD19]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD17]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i32 [[ADD19]], [[TMP6]]
 ; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]]
 ; CHECK-NEXT:    [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]]
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0