@@ -1371,6 +1371,8 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
+    LoadEntriesToVectorize.clear();
+    IsGraphTransformMode = false;
     GatheredLoadsEntriesFirst.reset();
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
@@ -3613,6 +3615,14 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;

+  /// A list of the load entries (node indices) that can be vectorized using a
+  /// strided or masked-gather approach, but which we first attempt to
+  /// represent as contiguous loads.
+  SetVector<unsigned> LoadEntriesToVectorize;
+
+  /// True if the graph-nodes-transforming mode is on.
+  bool IsGraphTransformMode = false;
+
   /// The index of the first gathered load entry in the VectorizeTree.
   std::optional<unsigned> GatheredLoadsEntriesFirst;

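The two new members work as a pair: indices of load nodes that would otherwise be emitted as slow scatter/masked-gather nodes are parked in LoadEntriesToVectorize, and IsGraphTransformMode keeps them from being parked again while transformNodes() rebuilds parts of the graph. Below is a minimal standalone sketch of the same defer-and-revisit idea, assuming only llvm::SetVector; the names are illustrative and are not the BoUpSLP API.

#include "llvm/ADT/SetVector.h"
#include <vector>

namespace {
// Illustrative stand-ins: Nodes ~ VectorizableTree, Deferred ~
// LoadEntriesToVectorize, InTransformMode ~ IsGraphTransformMode.
struct MiniGraph {
  std::vector<int> Nodes;
  llvm::SetVector<unsigned> Deferred;
  bool InTransformMode = false;

  // During the initial build, remember "slow" nodes instead of committing
  // to a scatter-vectorized form right away.
  void addNode(int N, bool Slow) {
    if (Slow && !InTransformMode && !Nodes.empty())
      Deferred.insert(Nodes.size()); // index of the node being added
    Nodes.push_back(N);
  }

  // Later, revisit the deferred indices in insertion order and retry them.
  void revisitDeferred() {
    for (unsigned Idx : Deferred)
      (void)Nodes[Idx]; // placeholder for the real re-vectorization attempt
  }
};
} // namespace
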
@@ -4618,17 +4628,15 @@ static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
   if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
-  if (!GEP1)
-    return false;
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
-  if (!GEP2)
-    return false;
-  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
-         ((isConstant(GEP1->getOperand(1)) &&
-           isConstant(GEP2->getOperand(1))) ||
+  return (!GEP1 || GEP1->getNumOperands() == 2) &&
+         (!GEP2 || GEP2->getNumOperands() == 2) &&
+         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
+           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
           !CompareOpcodes ||
-          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
-              .getOpcode());
+          (GEP1 && GEP2 &&
+           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
+               .getOpcode()));
 }

 /// Calculates minimal alignment as a common alignment.
@@ -5118,9 +5126,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
        return L->isLoopInvariant(V);
      })) <= Sz / 2;
-  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
+  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
         auto *GEP = dyn_cast<GetElementPtrInst>(P);
-        return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
+        return (!GEP && doesNotNeedToBeScheduled(P)) ||
                (GEP && GEP->getNumOperands() == 2 &&
                 isa<Constant, Instruction>(GEP->getOperand(1)));
       })) {
@@ -6667,6 +6675,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();

+  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
+      LoadEntriesToVectorize.size());
+  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
+               VectorizableTree[Idx]->Scalars.end());
+
   // Sort loads by distance.
   auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                        const std::pair<LoadInst *, int> &L2) {
@@ -6924,8 +6938,42 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
             }
           }
         }
+        // Cannot represent the loads as consecutive vectorizable nodes -
+        // just exit.
+        unsigned ConsecutiveNodesSize = 0;
+        if (!LoadEntriesToVectorize.empty() &&
+            any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                   [&, Slice = Slice](const auto &P) {
+                     const auto *It = find_if(Slice, [&](Value *V) {
+                       return std::get<1>(P).contains(V);
+                     });
+                     if (It == Slice.end())
+                       return false;
+                     ArrayRef<Value *> VL =
+                         VectorizableTree[std::get<0>(P)]->Scalars;
+                     ConsecutiveNodesSize += VL.size();
+                     unsigned Start = std::distance(Slice.begin(), It);
+                     unsigned Sz = Slice.size() - Start;
+                     return Sz < VL.size() ||
+                            Slice.slice(std::distance(Slice.begin(), It),
+                                        VL.size()) != VL;
+                   }))
+          continue;
         // Try to build long masked gather loads.
         UserMaxVF = bit_ceil(UserMaxVF);
+        if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                   [&, Slice = Slice](unsigned Idx) {
+                     OrdersType Order;
+                     SmallVector<Value *> PointerOps;
+                     return canVectorizeLoads(
+                                Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                Slice[Idx * UserMaxVF], Order,
+                                PointerOps) ==
+                            LoadsState::ScatterVectorize;
+                   }))
+          UserMaxVF = MaxVF;
+        if (Slice.size() != ConsecutiveNodesSize)
+          MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
       }
       for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
         bool IsVectorized = true;
@@ -6934,6 +6982,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
               Slice.slice(I, std::min(VF, E - I));
           if (getTreeEntry(SubSlice.front()))
             continue;
+          // Skip the subslice if it is part of a to-be-vectorized entry
+          // but is not equal to that entry.
+          if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                     [&](const auto &P) {
+                       return !SubSlice.equals(
+                                  VectorizableTree[std::get<0>(P)]
+                                      ->Scalars) &&
+                              set_is_subset(SubSlice, std::get<1>(P));
+                     }))
+            continue;
           unsigned Sz = VectorizableTree.size();
           buildTree_rec(SubSlice, 0, EdgeInfo());
           if (Sz == VectorizableTree.size()) {
@@ -6968,6 +7026,20 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     // Final attempt to vectorize non-vectorized loads.
     (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
+  // Try to vectorize postponed load entries, previously marked as gathered.
+  for (unsigned Idx : LoadEntriesToVectorize) {
+    const TreeEntry &E = *VectorizableTree[Idx];
+    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
+    // Avoid reordering, if possible.
+    if (!E.ReorderIndices.empty()) {
+      // Build a mask out of the reorder indices and reorder scalars per this
+      // mask.
+      SmallVector<int> ReorderMask;
+      inversePermutation(E.ReorderIndices, ReorderMask);
+      reorderScalars(GatheredScalars, ReorderMask);
+    }
+    buildTree_rec(GatheredScalars, 0, EdgeInfo());
+  }
   // If no new entries created, consider it as no gathered loads entries must be
   // handled.
   if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
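Before a postponed entry is rebuilt above, its recorded ReorderIndices are turned into a mask and applied, so buildTree_rec sees the scalars in their pre-reordered order. Below is a self-contained sketch of that inverse-permutation step written against plain std containers rather than the SLP helpers (inversePermutation/reorderScalars); the helper names here are hypothetical and only intended to mirror how those routines are used above.

#include <cassert>
#include <vector>

// Build the inverse of the permutation Indices (Mask[Indices[I]] = I),
// mirroring what inversePermutation() computes for the code above.
static std::vector<int> makeInverseMask(const std::vector<unsigned> &Indices) {
  std::vector<int> Mask(Indices.size(), -1);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = static_cast<int>(I);
  return Mask;
}

// Apply the mask: element I of the input lands at position Mask[I],
// analogous to how reorderScalars() is used in the loop above.
template <typename T>
static std::vector<T> applyMask(const std::vector<T> &Scalars,
                                const std::vector<int> &Mask) {
  std::vector<T> Out(Scalars.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I) {
    assert(Mask[I] >= 0 && "expected a full permutation");
    Out[Mask[I]] = Scalars[I];
  }
  return Out;
}
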
@@ -7280,6 +7352,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
+      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
       return TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
       return TreeEntry::StridedVectorize;
@@ -9117,6 +9194,17 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
+  // Turn the graph transforming mode on, and turn it off again when done.
+  class GraphTransformModeRAAI {
+    bool &SavedIsGraphTransformMode;
+
+  public:
+    GraphTransformModeRAAI(bool &IsGraphTransformMode)
+        : SavedIsGraphTransformMode(IsGraphTransformMode) {
+      IsGraphTransformMode = true;
+    }
+    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+  } TransformContext(IsGraphTransformMode);
   // Operands are profitable if they are:
   // 1. At least one constant
   // or
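The local class declared above is an RAII-style guard: constructing TransformContext sets IsGraphTransformMode, and the destructor clears it on every path out of transformNodes(), including early returns. A minimal standalone sketch of the same idiom follows, with hypothetical names; like the original, it resets the flag to false rather than restoring a saved value.

// Hypothetical, self-contained version of the flag guard used above.
struct FlagGuard {
  bool &Flag;
  explicit FlagGuard(bool &F) : Flag(F) { Flag = true; }
  ~FlagGuard() { Flag = false; } // runs on every exit path
  FlagGuard(const FlagGuard &) = delete;
  FlagGuard &operator=(const FlagGuard &) = delete;
};

static bool InTransform = false;

static void transformExample() {
  FlagGuard Guard(InTransform); // InTransform == true from here on
  if (InTransform) {
    // ... perform the graph transformations ...
  }
} // Guard's destructor resets InTransform to false
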
@@ -9149,7 +9237,7 @@ void BoUpSLP::transformNodes() {
     unsigned MinVF = getMinVF(2 * Sz);
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
-    if (VL.size() <= 2 ||
+    if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
         !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
           E.isAltShuffle() || !allSameBlock(VL)) ||
         allConstant(VL) || isSplat(VL))
@@ -9248,13 +9336,17 @@ void BoUpSLP::transformNodes() {
           continue;
         }
         unsigned PrevSize = VectorizableTree.size();
+        [[maybe_unused]] unsigned PrevEntriesSize =
+            LoadEntriesToVectorize.size();
         buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
         if (PrevSize + 1 == VectorizableTree.size() &&
             VectorizableTree[PrevSize]->isGather() &&
             VectorizableTree[PrevSize]->getOpcode() !=
                 Instruction::ExtractElement &&
             !isSplat(Slice)) {
           VectorizableTree.pop_back();
+          assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+                 "LoadEntriesToVectorize expected to remain the same");
           continue;
         }
         AddCombinedNode(PrevSize, Cnt);
@@ -9340,17 +9432,19 @@ void BoUpSLP::transformNodes() {
     }
   }

-  // Single load node - exit.
-  if (VectorizableTree.size() <= 1 &&
-      VectorizableTree.front()->getOpcode() == Instruction::Load)
-    return;
-  // Small graph with small VF - exit.
-  constexpr unsigned SmallTree = 3;
-  constexpr unsigned SmallVF = 2;
-  if ((VectorizableTree.size() <= SmallTree &&
-       VectorizableTree.front()->Scalars.size() == SmallVF) ||
-      (VectorizableTree.size() <= 2 && UserIgnoreList))
-    return;
+  if (LoadEntriesToVectorize.empty()) {
+    // Single load node - exit.
+    if (VectorizableTree.size() <= 1 &&
+        VectorizableTree.front()->getOpcode() == Instruction::Load)
+      return;
+    // Small graph with small VF - exit.
+    constexpr unsigned SmallTree = 3;
+    constexpr unsigned SmallVF = 2;
+    if ((VectorizableTree.size() <= SmallTree &&
+         VectorizableTree.front()->Scalars.size() == SmallVF) ||
+        (VectorizableTree.size() <= 2 && UserIgnoreList))
+      return;
+  }

   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.