@@ -2922,7 +2922,7 @@ class BoUpSLP {
29222922
29232923 /// This is the recursive part of buildTree.
29242924 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2925- const EdgeInfo &EI);
2925+ const EdgeInfo &EI, unsigned InterleaveFactor = 0);
29262926
29272927 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
29282928 /// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -3226,7 +3226,15 @@ class BoUpSLP {
32263226 Instruction *MainOp = nullptr;
32273227 Instruction *AltOp = nullptr;
32283228
3229+ /// Interleaving factor for interleaved-load Vectorize nodes.
3230+ unsigned InterleaveFactor = 0;
3231+
32293232 public:
3233+ /// Returns interleave factor for interleave nodes.
3234+ unsigned getInterleaveFactor() const { return InterleaveFactor; }
3235+ /// Sets the interleaving factor for interleaved nodes.
3236+ void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3237+
32303238 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
32313239 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
32323240 if (Operands.size() < OpIdx + 1)
@@ -3390,7 +3398,12 @@ class BoUpSLP {
33903398 dbgs() << "State: ";
33913399 switch (State) {
33923400 case Vectorize:
3393- dbgs() << "Vectorize\n";
3401+ if (InterleaveFactor > 0) {
3402+ dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3403+ << "\n";
3404+ } else {
3405+ dbgs() << "Vectorize\n";
3406+ }
33943407 break;
33953408 case ScatterVectorize:
33963409 dbgs() << "ScatterVectorize\n";
@@ -3460,11 +3473,15 @@ class BoUpSLP {
34603473 const InstructionsState &S,
34613474 const EdgeInfo &UserTreeIdx,
34623475 ArrayRef<int> ReuseShuffleIndices = {},
3463- ArrayRef<unsigned> ReorderIndices = {}) {
3476+ ArrayRef<unsigned> ReorderIndices = {},
3477+ unsigned InterleaveFactor = 0) {
34643478 TreeEntry::EntryState EntryState =
34653479 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3466- return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3467- ReuseShuffleIndices, ReorderIndices);
3480+ TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3481+ ReuseShuffleIndices, ReorderIndices);
3482+ if (E && InterleaveFactor > 0)
3483+ E->setInterleave(InterleaveFactor);
3484+ return E;
34683485 }
34693486
34703487 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
@@ -6849,7 +6866,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
68496866 return Results;
68506867 };
68516868 auto ProcessGatheredLoads =
6852- [&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
6869+ [&, &TTI = *TTI](
6870+ ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
68536871 bool Final = false) {
68546872 SmallVector<LoadInst *> NonVectorized;
68556873 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
@@ -6932,11 +6950,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69326950 // distance between scalar loads in these nodes.
69336951 unsigned MaxVF = Slice.size();
69346952 unsigned UserMaxVF = 0;
6953+ unsigned InterleaveFactor = 0;
69356954 if (MaxVF == 2) {
69366955 UserMaxVF = MaxVF;
69376956 } else {
6957+ // Distance between segments of the interleaved loads, once detected.
6958+ std::optional<unsigned> InterleavedLoadsDistance = 0;
6959+ unsigned Order = 0;
69386960 std::optional<unsigned> CommonVF = 0;
69396961 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
6962+ SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
69406963 for (auto [Idx, V] : enumerate(Slice)) {
69416964 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
69426965 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
@@ -6951,12 +6974,59 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69516974 if (*CommonVF != E->Scalars.size())
69526975 CommonVF.reset();
69536976 }
6977+ // Check if the load is part of an interleaved load.
6978+ if (Pos != Idx && InterleavedLoadsDistance) {
6979+ if (!DeinterleavedNodes.contains(E) &&
6980+ any_of(E->Scalars, [&, Slice = Slice](Value *V) {
6981+ if (isa<Constant>(V))
6982+ return false;
6983+ if (getTreeEntry(V))
6984+ return true;
6985+ const auto &Nodes = ValueToGatherNodes.at(V);
6986+ return (Nodes.size() != 1 || !Nodes.contains(E)) &&
6987+ !is_contained(Slice, V);
6988+ })) {
6989+ InterleavedLoadsDistance.reset();
6990+ continue;
6991+ }
6992+ DeinterleavedNodes.insert(E);
6993+ if (*InterleavedLoadsDistance == 0) {
6994+ InterleavedLoadsDistance = Idx - Pos;
6995+ continue;
6996+ }
6997+ if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
6998+ (Idx - Pos) / *InterleavedLoadsDistance < Order)
6999+ InterleavedLoadsDistance.reset();
7000+ Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7001+ }
7002+ }
7003+ }
7004+ DeinterleavedNodes.clear();
7005+ // Check if the large load represents an interleaved load operation.
7006+ if (InterleavedLoadsDistance.value_or(0) > 1 &&
7007+ CommonVF.value_or(0) != 0) {
7008+ InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7009+ unsigned VF = *CommonVF;
7010+ OrdersType Order;
7011+ SmallVector<Value *> PointerOps;
7012+ // Segmented load detected - vectorize at maximum vector factor.
7013+ if (TTI.isLegalInterleavedAccessType(
7014+ getWidenedType(Slice.front()->getType(), VF),
7015+ InterleaveFactor,
7016+ cast<LoadInst>(Slice.front())->getAlign(),
7017+ cast<LoadInst>(Slice.front())
7018+ ->getPointerAddressSpace()) &&
7019+ canVectorizeLoads(Slice, Slice.front(), Order,
7020+ PointerOps) == LoadsState::Vectorize) {
7021+ UserMaxVF = InterleaveFactor * VF;
7022+ } else {
7023+ InterleaveFactor = 0;
69547024 }
69557025 }
69567026 // Cannot represent the loads as consecutive vectorizable nodes -
69577027 // just exit.
69587028 unsigned ConsecutiveNodesSize = 0;
6959- if (!LoadEntriesToVectorize.empty() &&
7029+ if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
69607030 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
69617031 [&, Slice = Slice](const auto &P) {
69627032 const auto *It = find_if(Slice, [&](Value *V) {
@@ -6976,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69767046 continue;
69777047 // Try to build long masked gather loads.
69787048 UserMaxVF = bit_ceil(UserMaxVF);
6979- if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7049+ if (InterleaveFactor == 0 &&
7050+ any_of(seq<unsigned>(Slice.size() / UserMaxVF),
69807051 [&, Slice = Slice](unsigned Idx) {
69817052 OrdersType Order;
69827053 SmallVector<Value *> PointerOps;
@@ -7008,9 +7079,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
70087079 }))
70097080 continue;
70107081 unsigned Sz = VectorizableTree.size();
7011- buildTree_rec(SubSlice, 0, EdgeInfo());
7082+ buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
70127083 if (Sz == VectorizableTree.size()) {
70137084 IsVectorized = false;
7085+ // Try non-interleaved vectorization with smaller vector
7086+ // factor.
7087+ if (InterleaveFactor > 0) {
7088+ VF = 2 * (MaxVF / InterleaveFactor);
7089+ InterleaveFactor = 0;
7090+ }
70147091 continue;
70157092 }
70167093 }
@@ -7374,6 +7451,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
73747451 }
73757452 return TreeEntry::ScatterVectorize;
73767453 case LoadsState::StridedVectorize:
7454+ if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7455+ // Delay slow vectorized nodes for better vectorization attempts.
7456+ LoadEntriesToVectorize.insert(VectorizableTree.size());
7457+ return TreeEntry::NeedToGather;
7458+ }
73777459 return TreeEntry::StridedVectorize;
73787460 case LoadsState::Gather:
73797461#ifndef NDEBUG
@@ -7707,7 +7789,8 @@ class PHIHandler {
77077789} // namespace
77087790
77097791void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7710- const EdgeInfo &UserTreeIdx) {
7792+ const EdgeInfo &UserTreeIdx,
7793+ unsigned InterleaveFactor) {
77117794 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
77127795
77137796 SmallVector<int> ReuseShuffleIndices;
@@ -8185,7 +8268,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
81858268 switch (State) {
81868269 case TreeEntry::Vectorize:
81878270 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8188- ReuseShuffleIndices, CurrentOrder);
8271+ ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
81898272 if (CurrentOrder.empty())
81908273 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
81918274 else
@@ -9895,6 +9978,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
98959978 Idx = EMask[Idx];
98969979 }
98979980 CommonVF = E->Scalars.size();
9981+ } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
9982+ Factor && E->Scalars.size() != Mask.size() &&
9983+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
9984+ *Factor)) {
9985+ // Deinterleaved nodes are free.
9986+ std::iota(CommonMask.begin(), CommonMask.end(), 0);
98989987 }
98999988 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
99009989 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
@@ -10968,23 +11057,38 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1096811057 auto *LI0 = cast<LoadInst>(VL0);
1096911058 auto GetVectorCost = [&](InstructionCost CommonCost) {
1097011059 InstructionCost VecLdCost;
10971- if (E->State == TreeEntry::Vectorize) {
10972- VecLdCost = TTI->getMemoryOpCost(
10973- Instruction::Load, VecTy, LI0->getAlign(),
10974- LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
10975- } else if (E->State == TreeEntry::StridedVectorize) {
11060+ switch (E->State) {
11061+ case TreeEntry::Vectorize:
11062+ if (unsigned Factor = E->getInterleaveFactor()) {
11063+ VecLdCost = TTI->getInterleavedMemoryOpCost(
11064+ Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11065+ LI0->getPointerAddressSpace(), CostKind);
11066+
11067+ } else {
11068+ VecLdCost = TTI->getMemoryOpCost(
11069+ Instruction::Load, VecTy, LI0->getAlign(),
11070+ LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11071+ }
11072+ break;
11073+ case TreeEntry::StridedVectorize: {
1097611074 Align CommonAlignment =
1097711075 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
1097811076 VecLdCost = TTI->getStridedMemoryOpCost(
1097911077 Instruction::Load, VecTy, LI0->getPointerOperand(),
1098011078 /*VariableMask=*/false, CommonAlignment, CostKind);
10981- } else {
10982- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
11079+ break;
11080+ }
11081+ case TreeEntry::ScatterVectorize: {
1098311082 Align CommonAlignment =
1098411083 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
1098511084 VecLdCost = TTI->getGatherScatterOpCost(
1098611085 Instruction::Load, VecTy, LI0->getPointerOperand(),
1098711086 /*VariableMask=*/false, CommonAlignment, CostKind);
11087+ break;
11088+ }
11089+ case TreeEntry::CombinedVectorize:
11090+ case TreeEntry::NeedToGather:
11091+ llvm_unreachable("Unexpected vectorization state.");
1098811092 }
1098911093 return VecLdCost + CommonCost;
1099011094 };
@@ -11397,6 +11501,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
1139711501 }))
1139811502 return false;
1139911503
11504+ if (VectorizableTree.back()->isGather() &&
11505+ VectorizableTree.back()->isAltShuffle() &&
11506+ VectorizableTree.back()->getVectorFactor() > 2)
11507+ return false;
11508+
1140011509 assert(VectorizableTree.empty()
1140111510 ? ExternalUses.empty()
1140211511 : true && "We shouldn't have any external users");
0 commit comments