@@ -278,6 +278,22 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
278278 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
279279}
280280
281+ /// Returns the number of elements of the given type \p Ty, not greater than \p
282+ /// Sz, which forms type, which splits by \p TTI into whole vector types during
283+ /// legalization.
284+ static unsigned
285+ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
286+ unsigned Sz) {
287+ if (!isValidElementType(Ty))
288+ return bit_floor(Sz);
289+ // Find the number of elements, which forms full vectors.
290+ unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
291+ if (NumParts == 0 || NumParts >= Sz)
292+ return bit_floor(Sz);
293+ unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
294+ return (Sz / RegVF) * RegVF;
295+ }
296+
281297static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
282298 SmallVectorImpl<int> &Mask) {
283299 // The ShuffleBuilder implementation use shufflevector to splat an "element".
@@ -7716,7 +7732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
77167732 }
77177733 size_t NumUniqueScalarValues = UniqueValues.size();
77187734 bool IsFullVectors = hasFullVectorsOrPowerOf2(
7719- *TTI, UniqueValues.front()->getType( ), NumUniqueScalarValues);
7735+ *TTI, getValueType( UniqueValues.front()), NumUniqueScalarValues);
77207736 if (NumUniqueScalarValues == VL.size() &&
77217737 (VectorizeNonPowerOf2 || IsFullVectors)) {
77227738 ReuseShuffleIndices.clear();
@@ -17466,7 +17482,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1746617482 const unsigned Sz = R.getVectorElementSize(Chain[0]);
1746717483 unsigned VF = Chain.size();
1746817484
17469- if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) {
17485+ if (!has_single_bit(Sz) ||
17486+ !hasFullVectorsOrPowerOf2(
17487+ *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
17488+ VF) ||
17489+ VF < 2 || VF < MinVF) {
1747017490 // Check if vectorizing with a non-power-of-2 VF should be considered. At
1747117491 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
1747217492 // all vector lanes are used.
@@ -17484,10 +17504,12 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1748417504 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
1748517505 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
1748617506 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
17487- bool IsPowerOf2 =
17488- has_single_bit(ValOps.size()) ||
17507+ bool IsAllowedSize =
17508+ hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
17509+ ValOps.size()) ||
1748917510 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
17490- if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
17511+ if ((!IsAllowedSize && S.getOpcode() &&
17512+ S.getOpcode() != Instruction::Load &&
1749117513 (!S.MainOp->isSafeToRemove() ||
1749217514 any_of(ValOps.getArrayRef(),
1749317515 [&](Value *V) {
@@ -17498,7 +17520,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1749817520 }));
1749917521 }))) ||
1750017522 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
17501- Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
17523+ Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
1750217524 return false;
1750317525 }
1750417526 }
@@ -17626,15 +17648,11 @@ bool SLPVectorizerPass::vectorizeStores(
1762617648
1762717649 unsigned MaxVF =
1762817650 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
17629- unsigned MaxRegVF = MaxVF;
1763017651 auto *Store = cast<StoreInst>(Operands[0]);
1763117652 Type *StoreTy = Store->getValueOperand()->getType();
1763217653 Type *ValueTy = StoreTy;
1763317654 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
1763417655 ValueTy = Trunc->getSrcTy();
17635- if (ValueTy == StoreTy &&
17636- R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
17637- MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
1763817656 unsigned MinVF = std::max<unsigned>(
1763917657 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
1764017658 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
@@ -17652,10 +17670,21 @@ bool SLPVectorizerPass::vectorizeStores(
1765217670 // First try vectorizing with a non-power-of-2 VF. At the moment, only
1765317671 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
1765417672 // lanes are used.
17655- unsigned CandVF =
17656- std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
17657- if (has_single_bit(CandVF + 1))
17673+ unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
17674+ if (has_single_bit(CandVF + 1)) {
1765817675 NonPowerOf2VF = CandVF;
17676+ assert(NonPowerOf2VF != MaxVF &&
17677+ "Non-power-of-2 VF should not be equal to MaxVF");
17678+ }
17679+ }
17680+
17681+ unsigned MaxRegVF = MaxVF;
17682+ MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
17683+ if (MaxVF < MinVF) {
17684+ LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
17685+ << ") < "
17686+ << "MinVF (" << MinVF << ")\n");
17687+ continue;
1765917688 }
1766017689
1766117690 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
@@ -17810,7 +17839,7 @@ bool SLPVectorizerPass::vectorizeStores(
1781017839 std::bind(IsNotVectorized, Size >= MaxRegVF,
1781117840 std::placeholders::_1)));
1781217841 }
17813- if (!AnyProfitableGraph && Size >= MaxRegVF)
17842+ if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size) )
1781417843 break;
1781517844 }
1781617845 // All values vectorized - exit.
@@ -17823,16 +17852,21 @@ bool SLPVectorizerPass::vectorizeStores(
1782317852 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
1782417853 break;
1782517854 constexpr unsigned StoresLimit = 64;
17826- const unsigned MaxTotalNum = bit_floor( std::min<unsigned>(
17855+ const unsigned MaxTotalNum = std::min<unsigned>(
1782717856 Operands.size(),
1782817857 static_cast<unsigned>(
1782917858 End -
1783017859 std::distance(
1783117860 RangeSizes.begin(),
1783217861 find_if(RangeSizes, std::bind(IsNotVectorized, true,
1783317862 std::placeholders::_1))) +
17834- 1)));
17835- unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
17863+ 1));
17864+ unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
17865+ unsigned Limit =
17866+ getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
17867+ CandidateVFs.clear();
17868+ if (bit_floor(Limit) == VF)
17869+ CandidateVFs.push_back(Limit);
1783617870 if (VF > MaxTotalNum || VF >= StoresLimit)
1783717871 break;
1783817872 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
@@ -17841,7 +17875,6 @@ bool SLPVectorizerPass::vectorizeStores(
1784117875 });
1784217876 // Last attempt to vectorize max number of elements, if all previous
1784317877 // attempts were unsuccessful because of the cost issues.
17844- CandidateVFs.clear();
1784517878 CandidateVFs.push_back(VF);
1784617879 }
1784717880 }
0 commit comments