Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 180 additions & 13 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,11 @@ static cl::opt<bool> VectorizeCopyableElements(
cl::desc("Try to replace values with the idempotent instructions for "
"better vectorization."));

// Fallback per-loop trip count for the loop-aware SLP cost model. It is used
// for loops without a single exiting block and for loops for which
// ScalarEvolution reports no small constant trip count. A value of 0 disables
// loop-aware scaling: no loop nest is collected and each loop counts as one
// iteration.
static cl::opt<unsigned> LoopAwareMinTripCount(
"slp-cost-loop-min-trip-count", cl::init(1), cl::Hidden,
cl::desc("Minimum loop trip count, considered by the cost model during "
"modeling (0=loops are ignored and considered flat code)"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
Expand Down Expand Up @@ -2050,6 +2055,7 @@ class BoUpSLP {
UserIgnoreList = nullptr;
PostponedGathers.clear();
ValueToGatherNodes.clear();
LoopNest.clear();
}

unsigned getTreeSize() const { return VectorizableTree.size(); }
Expand Down Expand Up @@ -3592,6 +3598,15 @@ class BoUpSLP {
TargetTransformInfo::CastContextHint
getCastContextHint(const TreeEntry &TE) const;

/// \returns the multiplier the loop-aware cost model applies to costs
/// attributed to \p TE: the product of the per-loop trip count estimates of
/// the loop nest around the relevant block, starting at the innermost loop in
/// which the checked values are not all invariant and going outwards. Returns
/// 1 for split-vectorize entries and when no such loop exists.
/// \p Scalar if non-null, only this scalar (instead of all scalars of \p TE)
/// is checked for loop invariance; used for external uses.
/// \p U if non-null, the parent block of this user (instead of the block
/// derived from \p TE) selects the loop; used for external uses.
unsigned getScaleToLoopIterations(const TreeEntry &TE,
Value *Scalar = nullptr,
Instruction *U = nullptr) const;

/// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(const TreeEntry *E,
ArrayRef<Value *> VectorizedVals,
Expand Down Expand Up @@ -4492,6 +4507,10 @@ class BoUpSLP {
std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
CompressEntryToData;

/// The loop nest, used to check that only a single loop nest is vectorized,
/// not multiple, to avoid side effects from the loop-aware cost model.
SmallVector<const Loop *> LoopNest;

/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
Expand Down Expand Up @@ -9172,13 +9191,77 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}

/// Walks outwards from \p L and returns the first (i.e. innermost) loop in
/// which at least one value of \p VL is not loop-invariant. Returns nullptr if
/// all values of \p VL are invariant in \p L and in every parent loop of it.
static const Loop *findInnermostNonInvariantLoop(const Loop *L,
                                                 ArrayRef<Value *> VL) {
  assert(L && "Expected valid loop");
  for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
    // The search ends at the first loop that has a variant value.
    if (any_of(VL, [Cur](Value *V) { return !Cur->isLoopInvariant(V); }))
      return Cur;
  }
  return nullptr;
}

/// Collects \p L together with all of its parent loops, ordered from the
/// outermost loop down to \p L itself. The result is empty when the loop-aware
/// cost model is disabled (LoopAwareMinTripCount == 0).
static SmallVector<const Loop *> getLoopNest(const Loop *L) {
  assert(L && "Expected valid loop");
  SmallVector<const Loop *> InnerToOuter;
  if (LoopAwareMinTripCount != 0) {
    for (const Loop *Cur = L; Cur; Cur = Cur->getParentLoop())
      InnerToOuter.push_back(Cur);
  }
  // Flip the order so that the outermost loop comes first.
  return SmallVector<const Loop *>(InnerToOuter.rbegin(), InnerToOuter.rend());
}

BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
SmallVectorImpl<Value *> &PointerOps) {
assert(S.getMainOp() &&
"Expected instructions with same/alternate opcodes only.");

// Check the loop nest. We need to be sure that we handle a single loop nest
// at a time, to avoid incorrect cost estimation caused by the loop-aware cost
// model.
if (VectorizableTree.empty()) {
assert(LoopNest.empty() && "Expected empty loop nest");
// Process the first node: initial fill of the loop nest.
BasicBlock *Parent = S.getMainOp()->getParent();
if (const Loop *L = LI->getLoopFor(Parent)) {
L = findInnermostNonInvariantLoop(L, VL);
if (L)
LoopNest = getLoopNest(L);
}
} else {
BasicBlock *Parent = S.getMainOp()->getParent();
if (const Loop *L = LI->getLoopFor(Parent)) {
// Check that the new loop nest is not involved.
// Otherwise, mark it as a gather node.
L = findInnermostNonInvariantLoop(L, VL);
if (L) {
SmallVector<const Loop *> NewLoopNest = getLoopNest(L);
for (const auto [L1, L2] : zip_longest(LoopNest, NewLoopNest)) {
if (L1 && L2) {
if (*L1 != *L2) {
LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
return TreeEntry::NeedToGather;
}
continue;
}
if (!L2)
break;
assert(!L1 && "L1 is expected to be null");
LoopNest.push_back(*L2);
}
}
}
}

unsigned ShuffleOrOp =
S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
Instruction *VL0 = S.getMainOp();
Expand Down Expand Up @@ -13468,6 +13551,59 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
return TTI::CastContextHint::None;
}

/// Returns the trip count estimate used to scale costs inside loop \p L:
///  - 1 if loop-aware modeling is disabled (LoopAwareMinTripCount == 0);
///  - the small constant trip count reported by \p SE, if \p L has a single
///    exiting block and that count is non-zero;
///  - LoopAwareMinTripCount otherwise.
static unsigned getLoopMinTripCount(const Loop *L, ScalarEvolution &SE) {
  const unsigned FallbackTripCount = LoopAwareMinTripCount;
  // Loops are treated as flat code.
  if (FallbackTripCount == 0)
    return 1;
  // Only loops with a single exiting block are queried for a constant trip
  // count; all other loops use the fallback.
  if (L->getExitingBlock()) {
    if (const unsigned KnownTripCount = SE.getSmallConstantTripCount(L))
      return KnownTripCount;
  }
  return FallbackTripCount;
}

unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
Instruction *U) const {
unsigned Scale = 1;
if (TE.State == TreeEntry::SplitVectorize)
return Scale;
BasicBlock *Parent = nullptr;
if (U) {
Parent = U->getParent();
} else if (TE.isGather()) {
EdgeInfo EI = TE.UserTreeIndex;
while (EI.UserTE) {
if (EI.UserTE->isGather()) {
EI = EI.UserTE->UserTreeIndex;
continue;
}
if (EI.UserTE->State == TreeEntry::Vectorize &&
EI.UserTE->getOpcode() == Instruction::PHI) {
auto *PH = cast<PHINode>(EI.UserTE->getMainOp());
Parent = PH->getIncomingBlock(EI.EdgeIdx);
} else {
Parent = EI.UserTE->getMainOp()->getParent();
}
break;
}
if (!Parent)
return Scale;
} else {
Parent = TE.getMainOp()->getParent();
}
if (const Loop *L = LI->getLoopFor(Parent)) {
L = findInnermostNonInvariantLoop(L, Scalar ? ArrayRef(Scalar)
: ArrayRef(TE.Scalars));
if (L) {
SmallVector<const Loop *> Nest = getLoopNest(L);
for (const Loop *L : reverse(Nest))
Scale *= getLoopMinTripCount(L, *SE);
}
}
return Scale;
}

InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
Expand Down Expand Up @@ -14862,10 +14998,14 @@ InstructionCost BoUpSLP::getSpillCost() {
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
unsigned Scale = getScaleToLoopIterations(*Op);
InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
KeepLiveCost *= Scale;
Cost += KeepLiveCost;
if (ScalarTy->isVectorTy()) {
// Handle revec dead vector instructions.
Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
Scale;
}
};
// Memoize the relationship between blocks, i.e. if there is (at least one)
Expand Down Expand Up @@ -15164,7 +15304,7 @@ template <typename T> struct ShuffledInsertData {

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
InstructionCost ReductionCost) {
InstructionCost Cost = ReductionCost;
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");

Expand Down Expand Up @@ -15199,13 +15339,29 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
"Expected gather nodes with users only.");

InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts) *
getScaleToLoopIterations(TE);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars, TE.Idx) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
}

// Add reduced value cost, if resized.
Instruction *ReductionRoot = nullptr;
if (UserIgnoreList) {
const auto It = find_if(*UserIgnoreList, IsaPred<Instruction>);
assert(It != UserIgnoreList->end() && "Expected reduction instruction.");
ReductionRoot = cast<Instruction>(*It);
// Scale reduction cost to the factor of the loop nest trip count.
ReductionCost *=
getScaleToLoopIterations(*VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
}

// Add the cost for reduction.
Cost += ReductionCost;

if (Cost >= -SLPCostThreshold &&
none_of(ExternalUses, [](const ExternalUser &EU) {
return isa_and_nonnull<InsertElementInst>(EU.User);
Expand Down Expand Up @@ -15495,6 +15651,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
}

ExtraCost *= getScaleToLoopIterations(EU.E, EU.Scalar,
cast_or_null<Instruction>(EU.User));

ExtractCost += ExtraCost;
}
// Insert externals for extract of operands of casts to be emitted as scalars
Expand All @@ -15506,7 +15665,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
TEs.front()->findLaneForValue(V));
}
}
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
const TreeEntry &Root = *VectorizableTree.front();
auto BWIt = MinBWs.find(&Root);
Expand All @@ -15524,9 +15682,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
assert(SLPReVec && "Only supported by REVEC.");
SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
}
Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
TTI::CastContextHint::None,
TTI::TCK_RecipThroughput);
InstructionCost CastCost =
TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
TTI::CastContextHint::None,
TTI::TCK_RecipThroughput) *
getScaleToLoopIterations(Root, /*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
}
}
}
Expand Down Expand Up @@ -15598,6 +15759,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
})) {
InstructionCost C =
::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
C *= getScaleToLoopIterations(*TEs.front());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement "
"external users.\n";
Expand All @@ -15616,6 +15778,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
InstructionCost C =
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
C *= getScaleToLoopIterations(*TEs.back());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of vector node and external "
"insertelement users.\n";
Expand Down Expand Up @@ -15669,7 +15832,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *DstVecTy =
getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
TTI::CastContextHint CCH = getCastContextHint(E);
InstructionCost CastCost;
switch (E.getOpcode()) {
case Instruction::SExt:
case Instruction::ZExt:
Expand All @@ -15681,8 +15843,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
default:
break;
}
CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
TTI::TCK_RecipThroughput);
InstructionCost CastCost =
TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
TTI::TCK_RecipThroughput) *
getScaleToLoopIterations(*VectorizableTree.front().get(),
/*Scalar=*/nullptr, ReductionRoot);
Cost += CastCost;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
<< " for final resize for reduction from " << SrcVecTy
Expand All @@ -15706,8 +15871,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
OS << *SpillCost;
else
OS << "<skipped>";
OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
if (ReductionRoot)
OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
OS << "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
Expand Down
21 changes: 11 additions & 10 deletions llvm/test/Transforms/SLPVectorizer/X86/phi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -136,30 +136,31 @@ for.end: ; preds = %for.body
define float @foo3(ptr nocapture readonly %A) #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = phi float [ [[TMP4]], [[ENTRY]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 4
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP21]] = load float, ptr [[ARRAYIDX24]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
; CHECK-NEXT: [[TMP10]] = load float, ptr [[ARRAYIDX19]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP21]], i32 3
; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-5'
; YAML-NEXT: - Cost: '-40'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '4'

Expand All @@ -96,7 +96,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '-7'
; YAML-NEXT: - Cost: '-56'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '1'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ for.body: ; preds = %for.body, %entry
; YAML-NEXT: Function: foo
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
; YAML-NEXT: - Cost: '-1'
; YAML-NEXT: - Cost: '-8'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '4'

Expand Down