@@ -1126,6 +1126,9 @@ class BoUpSLP {
11261126 void
11271127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
11281128
1129+ /// Transforms graph nodes to target-specific representations, if profitable.
1130+ void transformNodes();
1131+
11291132 /// Clear the internal data structures that are created by 'buildTree'.
11301133 void deleteTree() {
11311134 VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
78137816 return std::make_pair(ScalarCost, VecCost);
78147817}
78157818
7819+ void BoUpSLP::transformNodes() {
7820+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7821+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822+ TreeEntry &E = *TE.get();
7823+ switch (E.getOpcode()) {
7824+ case Instruction::Load: {
7825+ Type *ScalarTy = E.getMainOp()->getType();
7826+ auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7827+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7828+ // Check if profitable to represent consecutive load + reverse as strided
7829+ // load with stride -1.
7830+ if (isReverseOrder(E.ReorderIndices) &&
7831+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7832+ SmallVector<int> Mask;
7833+ inversePermutation(E.ReorderIndices, Mask);
7834+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7835+ InstructionCost OriginalVecCost =
7836+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
7837+ BaseLI->getPointerAddressSpace(), CostKind,
7838+ TTI::OperandValueInfo()) +
7839+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
7840+ InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7841+ Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7842+ /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
7843+ if (StridedCost < OriginalVecCost)
7844+ // Strided load is more profitable than consecutive load + reverse -
7845+ // transform the node to strided load.
7846+ E.State = TreeEntry::StridedVectorize;
7847+ }
7848+ break;
7849+ }
7850+ default:
7851+ break;
7852+ }
7853+ }
7854+ }
7855+
78167856/// Merges shuffle masks and emits final shuffle instruction, if required. It
78177857/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
78187858/// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1518915229 R.buildExternalUses();
1519015230
1519115231 R.computeMinimumValueSizes();
15232+ R.transformNodes();
1519215233
1519315234 InstructionCost Cost = R.getTreeCost();
1519415235
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
1556715608 R.buildExternalUses();
1556815609
1556915610 R.computeMinimumValueSizes();
15611+ R.transformNodes();
1557015612 InstructionCost Cost = R.getTreeCost();
1557115613 CandidateFound = true;
1557215614 MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
1656316605 V.buildExternalUses(LocalExternallyUsedValues);
1656416606
1656516607 V.computeMinimumValueSizes();
16608+ V.transformNodes();
1656616609
1656716610 // Estimate cost.
1656816611 InstructionCost TreeCost = V.getTreeCost(VL);
0 commit comments