@@ -2656,7 +2656,9 @@ class BoUpSLP {
26562656 }
26572657 // TODO: Check if we can remove a check for non-power-2 number of
26582658 // scalars after full support of non-power-2 vectorization.
2659- return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2659+ return UniqueValues.size() != 2 &&
2660+ hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661+ UniqueValues.size());
26602662 };
26612663
26622664 // If the initial strategy fails for any of the operand indexes, then we
@@ -5101,12 +5103,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
51015103 });
51025104 });
51035105 const unsigned AbsoluteDiff = std::abs(*Diff);
5104- if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
5105- ((Sz > MinProfitableStridedLoads ||
5106- (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5107- has_single_bit(AbsoluteDiff))) &&
5108- AbsoluteDiff > Sz) ||
5109- *Diff == -(static_cast<int>(Sz) - 1))) {
5106+ if (IsPossibleStrided &&
5107+ (IsAnyPointerUsedOutGraph ||
5108+ (AbsoluteDiff > Sz &&
5109+ (Sz > MinProfitableStridedLoads ||
5110+ (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5111+ AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5112+ *Diff == -(static_cast<int>(Sz) - 1))) {
51105113 int Stride = *Diff / static_cast<int>(Sz - 1);
51115114 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
51125115 Align Alignment =
@@ -5192,17 +5195,20 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
51925195 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
51935196
51945197 // FIXME: The following code has not been updated for non-power-of-2
5195- // vectors. The splitting logic here does not cover the original
5196- // vector if the vector factor is not a power of two. FIXME
5197- if (!has_single_bit( VL.size()))
5198+ // vectors (and not whole registers). The splitting logic here does not
5199+ // cover the original vector if the vector factor is not a power of two.
5200+ if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
51985201 return false;
51995202
52005203 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
52015204 unsigned MinVF = getMinVF(2 * Sz);
52025205 DemandedElts.clearAllBits();
52035206 // Iterate through possible vectorization factors and check if vectorized +
52045207 // shuffles is better than just gather.
5205- for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
5208+ for (unsigned VF =
5209+ getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5210+ VF >= MinVF;
5211+ VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
52065212 SmallVector<LoadsState> States;
52075213 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
52085214 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7632,8 +7638,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
76327638 case Instruction::ExtractValue:
76337639 case Instruction::ExtractElement: {
76347640 bool Reuse = canReuseExtract(VL, CurrentOrder);
7635- // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7636- if (!has_single_bit(VL.size()))
7641+ // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7642+ // non-full registers).
7643+ if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
76377644 return TreeEntry::NeedToGather;
76387645 if (Reuse || !CurrentOrder.empty())
76397646 return TreeEntry::Vectorize;
@@ -8089,7 +8096,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
80898096 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
80908097 if ((UserTreeIdx.UserTE &&
80918098 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8092- !has_single_bit( VL.size())) {
8099+ !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
80938100 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
80948101 "for nodes with padding.\n");
80958102 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9840,7 +9847,8 @@ void BoUpSLP::transformNodes() {
98409847 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
98419848 (S.getOpcode() == Instruction::Load &&
98429849 areKnownNonVectorizableLoads(Slice)) ||
9843- (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
9850+ (S.getOpcode() != Instruction::Load &&
9851+ !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
98449852 continue;
98459853 if (VF == 2) {
98469854 // Try to vectorize reduced values or if all users are vectorized.
@@ -13618,8 +13626,9 @@ BoUpSLP::isGatherShuffledEntry(
1361813626 return !TE->isGather();
1361913627 })))
1362013628 return {};
13621- // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
13622- if (TE->isNonPowOf2Vec())
13629+ // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13630+ // implemented yet.
13631+ if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
1362313632 return {};
1362413633 Mask.assign(VL.size(), PoisonMaskElem);
1362513634 assert((TE->UserTreeIndices.size() == 1 ||
@@ -19200,9 +19209,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1920019209 }
1920119210 }
1920219211
19212+ Type *ScalarTy = getValueType(VL[0]);
1920319213 unsigned Sz = R.getVectorElementSize(I0);
1920419214 unsigned MinVF = R.getMinVF(Sz);
19205- unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
19215+ unsigned MaxVF = std::max<unsigned>(
19216+ getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
1920619217 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
1920719218 if (MaxVF < 2) {
1920819219 R.getORE()->emit([&]() {
@@ -19216,10 +19227,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1921619227 bool Changed = false;
1921719228 bool CandidateFound = false;
1921819229 InstructionCost MinCost = SLPCostThreshold.getValue();
19219- Type *ScalarTy = getValueType(VL[0]);
1922019230
1922119231 unsigned NextInst = 0, MaxInst = VL.size();
19222- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
19232+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19233+ VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
1922319234 // No actual vectorization should happen, if number of parts is the same as
1922419235 // provided vectorization factor (i.e. the scalar type is used for vector
1922519236 // code during codegen).
@@ -19234,7 +19245,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1923419245
1923519246 if (MaxVFOnly && ActualVF < MaxVF)
1923619247 break;
19237- if ((VF > MinVF && ActualVF <= VF / 2 ) || (VF == MinVF && ActualVF < 2))
19248+ if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
1923819249 break;
1923919250
1924019251 SmallVector<Value *> Ops(ActualVF, nullptr);
0 commit comments