@@ -1347,6 +1347,7 @@ class BoUpSLP {
13471347 }
13481348 MinBWs.clear();
13491349 ReductionBitWidth = 0;
1350+ BaseGraphSize = 1;
13501351 CastMaxMinBWSizes.reset();
13511352 ExtraBitWidthNodes.clear();
13521353 InstrElementSize.clear();
@@ -1355,11 +1356,10 @@ class BoUpSLP {
13551356 ValueToGatherNodes.clear();
13561357 }
13571358
1358- unsigned getTreeSize() const {
1359- return GatheredLoadsEntriesFirst == NoGatheredLoads
1360- ? VectorizableTree.size()
1361- : GatheredLoadsEntriesFirst;
1362- }
1359+ unsigned getTreeSize() const { return VectorizableTree.size(); }
1360+
1361+ /// Returns the base graph size, before any transformations.
1362+ unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
13631363
13641364 /// Perform LICM and CSE on the newly generated gather sequences.
13651365 void optimizeGatherSequence();
@@ -4191,6 +4191,9 @@ class BoUpSLP {
41914191 /// reduction.
41924192 unsigned ReductionBitWidth = 0;
41934193
4194+ /// Canonical graph size before the transformations.
4195+ unsigned BaseGraphSize = 1;
4196+
41944197 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
41954198 /// type sizes, used in the tree.
41964199 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
@@ -9001,47 +9004,147 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
90019004
90029005void BoUpSLP::transformNodes() {
90039006 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9007+ BaseGraphSize = VectorizableTree.size();
9008+ // Operands are profitable if:
9009+ // 1. At least one of them is a constant
9010+ // or
9011+ // 2. They form a splat (both are the same value)
9012+ // or
9013+ // 3. They result in a good vectorization opportunity, i.e. may generate
9014+ // vector nodes and reduce the cost of the graph.
9015+ auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9016+ const InstructionsState &S) {
9017+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9018+ for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
9019+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9020+ I2->getOperand(Op));
9021+ return all_of(
9022+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9023+ return all_of(Cand,
9024+ [](const std::pair<Value *, Value *> &P) {
9025+ return isa<Constant>(P.first) ||
9026+ isa<Constant>(P.second) || P.first == P.second;
9027+ }) ||
9028+ findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9029+ });
9030+ };
90049031 // The tree may grow here, so iterate over nodes, built before.
9005- for (unsigned Idx : seq<unsigned>(VectorizableTree.size() )) {
9032+ for (unsigned Idx : seq<unsigned>(BaseGraphSize )) {
90069033 TreeEntry &E = *VectorizableTree[Idx];
90079034 if (E.isGather()) {
90089035 ArrayRef<Value *> VL = E.Scalars;
90099036 const unsigned Sz = getVectorElementSize(VL.front());
90109037 unsigned MinVF = getMinVF(2 * Sz);
9038+ // Skip partial vectorization for small nodes (<= 2), non-load nodes with
9039+ // the same opcode and parent block, all constants and splats.
90119040 if (VL.size() <= 2 ||
9012- (E.getOpcode() &&
9013- (E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
9041+ !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9042+ E.isAltShuffle() || !allSameBlock(VL)) ||
9043+ allConstant(VL) || isSplat(VL))
90149044 continue;
90159045 // Try to find vectorizable sequences and transform them into a series of
90169046 // insertvector instructions.
90179047 unsigned StartIdx = 0;
90189048 unsigned End = VL.size();
9019- for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
9049+ for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
9050+ SmallVector<unsigned> Slices;
90209051 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
90219052 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
90229053 // If any instruction is vectorized already - do not try again.
9023- if (getTreeEntry(Slice.front()) || getTreeEntry(Slice.back()))
9054+ // Reuse the existing node, if it fully matches the slice.
9055+ if (const TreeEntry *SE = getTreeEntry(Slice.front());
9056+ SE || getTreeEntry(Slice.back())) {
9057+ if (!SE)
9058+ continue;
9059+ if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9060+ continue;
9061+ }
9062+ // Constants are already handled effectively - skip.
9063+ if (allConstant(Slice))
90249064 continue;
9025- InstructionsState S = getSameOpcode(Slice, *TLI);
9026- if (!S.getOpcode() || S.isAltShuffle() ||
9027- (S.getOpcode() != Instruction::Load &&
9028- any_of(Slice, [&](Value *V) {
9029- return !areAllUsersVectorized(cast<Instruction>(V),
9030- UserIgnoreList);
9031- })))
9065+ // Do not try to vectorize small splats (narrower than a vector
9066+ // register and with only a single non-undef element).
9067+ bool IsSplat = isSplat(Slice);
9068+ if (Slices.empty() || !IsSplat ||
9069+ (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9070+ Slice.front()->getType(), VF)),
9071+ 1U, VF - 1) !=
9072+ std::clamp(TTI->getNumberOfParts(getWidenedType(
9073+ Slice.front()->getType(), 2 * VF)),
9074+ 1U, 2 * VF)) ||
9075+ count(Slice, Slice.front()) ==
9076+ (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
9077+ if (IsSplat)
9078+ continue;
9079+ InstructionsState S = getSameOpcode(Slice, *TLI);
9080+ if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
9081+ continue;
9082+ if (VF == 2) {
9083+ // Try to vectorize reduced values or if all users are vectorized.
9084+ // For expensive instructions extra extracts might be profitable.
9085+ if ((!UserIgnoreList || E.Idx != 0) &&
9086+ TTI->getInstructionCost(cast<Instruction>(Slice.front()),
9087+ CostKind) < TTI::TCC_Expensive &&
9088+ !all_of(Slice, [&](Value *V) {
9089+ return areAllUsersVectorized(cast<Instruction>(V),
9090+ UserIgnoreList);
9091+ }))
9092+ continue;
9093+ if (S.getOpcode() == Instruction::Load) {
9094+ OrdersType Order;
9095+ SmallVector<Value *> PointerOps;
9096+ LoadsState Res =
9097+ canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9098+ // Do not vectorize gathers.
9099+ if (Res == LoadsState::ScatterVectorize ||
9100+ Res == LoadsState::Gather)
9101+ continue;
9102+ } else if (S.getOpcode() == Instruction::ExtractElement ||
9103+ (TTI->getInstructionCost(
9104+ cast<Instruction>(Slice.front()), CostKind) <
9105+ TTI::TCC_Expensive &&
9106+ !CheckOperandsProfitability(
9107+ cast<Instruction>(Slice.front()),
9108+ cast<Instruction>(Slice.back()), S))) {
9109+ // Do not vectorize extractelements (handled effectively
9110+ // already). Do not vectorize non-profitable instructions (with
9111+ // low cost and non-vectorizable operands).
9112+ continue;
9113+ }
9114+ }
9115+ }
9116+ Slices.emplace_back(Cnt);
9117+ }
9118+ auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
9119+ E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9120+ if (StartIdx == Cnt)
9121+ StartIdx = Cnt + VF;
9122+ if (End == Cnt + VF)
9123+ End = Cnt;
9124+ };
9125+ for (unsigned Cnt : Slices) {
9126+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9127+ // If any instruction is vectorized already - do not try again.
9128+ if (const TreeEntry *SE = getTreeEntry(Slice.front());
9129+ SE || getTreeEntry(Slice.back())) {
9130+ if (!SE)
9131+ continue;
9132+ if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9133+ continue;
9134+ AddCombinedNode(SE->Idx, Cnt);
90329135 continue;
9136+ }
90339137 unsigned PrevSize = VectorizableTree.size();
90349138 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
90359139 if (PrevSize + 1 == VectorizableTree.size() &&
9036- VectorizableTree[PrevSize]->isGather()) {
9140+ VectorizableTree[PrevSize]->isGather() &&
9141+ VectorizableTree[PrevSize]->getOpcode() !=
9142+ Instruction::ExtractElement &&
9143+ !isSplat(Slice)) {
90379144 VectorizableTree.pop_back();
90389145 continue;
90399146 }
9040- E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
9041- if (StartIdx == Cnt)
9042- StartIdx = Cnt + VF;
9043- if (End == Cnt + VF)
9044- End = Cnt;
9147+ AddCombinedNode(PrevSize, Cnt);
90459148 }
90469149 }
90479150 }
@@ -12293,6 +12396,14 @@ BoUpSLP::isGatherShuffledEntry(
1229312396 "Expected only single user of the gather node.");
1229412397 assert(VL.size() % NumParts == 0 &&
1229512398 "Number of scalars must be divisible by NumParts.");
12399+ if (!TE->UserTreeIndices.empty() &&
12400+ TE->UserTreeIndices.front().UserTE->isGather() &&
12401+ TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
12402+ assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
12403+ isSplat(TE->Scalars)) &&
12404+ "Expected splat or extractelements only node.");
12405+ return {};
12406+ }
1229612407 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
1229712408 SmallVector<std::optional<TTI::ShuffleKind>> Res;
1229812409 for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17119,7 +17230,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1711917230 if (R.isGathered(Chain.front()) ||
1712017231 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
1712117232 return std::nullopt;
17122- Size = R.getTreeSize ();
17233+ Size = R.getCanonicalGraphSize ();
1712317234 return false;
1712417235 }
1712517236 R.reorderTopToBottom();
@@ -17129,7 +17240,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1712917240
1713017241 R.computeMinimumValueSizes();
1713117242
17132- Size = R.getTreeSize ();
17243+ Size = R.getCanonicalGraphSize ();
1713317244 if (S.getOpcode() == Instruction::Load)
1713417245 Size = 2; // cut off masked gather small trees
1713517246 InstructionCost Cost = R.getTreeCost();
0 commit comments