Merged
120 changes: 64 additions & 56 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {}
Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}

/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1553,9 +1553,9 @@ class LoopVectorizationCostModel {

/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
TTI::TargetCostKind CostKind) const;
std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
ElementCount VF,
Type *VectorTy) const;

/// Returns true if \p Op should be considered invariant and if it is
/// trivially hoistable.
@@ -1614,8 +1614,8 @@ class LoopVectorizationCostModel {

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
TTI::TargetCostKind CostKind) const;
InstructionCost getScalarizationOverhead(Instruction *I,
ElementCount VF) const;

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1796,6 +1796,9 @@ class LoopVectorizationCostModel {

/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;

/// The kind of cost that we are calculating.
TTI::TargetCostKind CostKind;
Review comment (Contributor):
I think somewhere in the debug output it would be useful to tell the user what cost model we're using now that it's variable. For example, perhaps in LoopVectorizationPlanner::computeBestVF before we start calculating the costs you could print out the cost model being used.

};
} // end namespace llvm
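The review comment above asks for the chosen cost kind to be surfaced in the debug output now that it is configurable; the hunk in LoopVectorizationPlanner::computeBestVF further down does exactly that with nested conditionals. Purely as an illustration (not part of this diff; the helper name costKindName is made up, and the sketch assumes the usual LoopVectorize.cpp context with LLVM_DEBUG, dbgs(), llvm_unreachable and the TTI shorthand in scope), the same print could also be phrased as a small switch:

// Illustrative sketch only -- not part of the patch.
static const char *costKindName(TTI::TargetCostKind Kind) {
  switch (Kind) {
  case TTI::TCK_RecipThroughput:
    return "Reciprocal Throughput";
  case TTI::TCK_Latency:
    return "Instruction Latency";
  case TTI::TCK_CodeSize:
    return "Code Size";
  case TTI::TCK_SizeAndLatency:
    return "Code Size and Latency";
  }
  llvm_unreachable("Unknown TargetCostKind");
}

// Emitted once in computeBestVF(), before the per-VF costs are computed.
LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                  << costKindName(CM.CostKind) << "\n");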

@@ -1836,13 +1839,17 @@ class GeneratedRTChecks {

PredicatedScalarEvolution &PSE;

/// The kind of cost that we are calculating.
TTI::TargetCostKind CostKind;

public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
const DataLayout &DL, bool AddBranchWeights)
const DataLayout &DL, bool AddBranchWeights,
TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
AddBranchWeights(AddBranchWeights), PSE(PSE) {}
AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1954,8 +1961,7 @@ class GeneratedRTChecks {
for (Instruction &I : *SCEVCheckBlock) {
if (SCEVCheckBlock->getTerminator() == &I)
continue;
InstructionCost C =
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
@@ -1964,8 +1970,7 @@
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
InstructionCost C =
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
MemCheckCost += C;
}
@@ -2926,10 +2931,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
if (!VF.isScalar())
return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *RetTy = CI->getType();
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
return *RedCost;

SmallVector<Type *, 4> Tys;
@@ -2972,8 +2976,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,

IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
return TTI.getIntrinsicInstrCost(CostAttrs,
TargetTransformInfo::TCK_RecipThroughput);
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3430,8 +3433,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
I->getOpcode() == Instruction::URem);
assert(!isSafeToSpeculativelyExecute(I));

const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

// Scalarization isn't legal for scalable vector types
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (!VF.isScalable()) {
@@ -3453,7 +3454,7 @@

// The cost of insertelement and extractelement instructions needed for
// scalarization.
ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
ScalarizationCost += getScalarizationOverhead(I, VF);

// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
@@ -4426,7 +4427,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
CM);
CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5576,7 +5577,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5723,15 +5723,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
Alignment, AS, CostKind);

// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
Cost += getScalarizationOverhead(I, VF, CostKind);
Cost += getScalarizationOverhead(I, VF);

// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
@@ -5764,7 +5763,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
@@ -5795,12 +5793,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
CostKind);
}
StoreInst *SI = cast<StoreInst>(I);

@@ -5823,9 +5821,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
const Value *Ptr = getLoadStorePointerOperand(I);

return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(
I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
TargetTransformInfo::TCK_RecipThroughput, I);
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I), Alignment,
CostKind, I);
}

InstructionCost
@@ -5838,7 +5836,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(InsertPos);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5870,9 +5867,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}

std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty,
TTI::TargetCostKind CostKind) const {
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
ElementCount VF,
Type *Ty) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6063,14 +6060,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,

TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
TTI::TCK_RecipThroughput, OpInfo, I);
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
OpInfo, I);
}
return getWideningCost(I, VF);
}

InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) const {

// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6313,7 +6311,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
InstructionCost ScalarCost = InstructionCost::getInvalid();
InstructionCost VectorCost = InstructionCost::getInvalid();
InstructionCost IntrinsicCost = InstructionCost::getInvalid();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6329,8 +6326,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {

// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
InstructionCost ScalarizationCost =
getScalarizationOverhead(CI, VF, CostKind);
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6354,7 +6350,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
getVectorIntrinsicIDForCall(CI, TLI),
std::nullopt, *RedCost);
@@ -6439,7 +6435,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
TargetTransformInfo::SK_Broadcast,
VectorType::get(IntegerType::getInt1Ty(
VecFunc->getFunctionType()->getContext()),
VF));
VF),
{}, CostKind);

if (TLI && VecFunc && !CI->isNoBuiltin())
VectorCost =
@@ -6507,7 +6504,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
auto *SE = PSE.getSE();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

auto HasSingleCopyAfterVectorization = [this](Instruction *I,
ElementCount VF) -> bool {
@@ -6683,7 +6679,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
InstructionCost MulCost = TTI::TCC_Free;
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
if (!RHS || RHS->getZExtValue() != 1)
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

// Find the cost of the histogram operation itself.
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
@@ -6694,9 +6691,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
{PtrTy, ScalarTy, MaskTy});

// Add the costs together with the add/sub operation.
return TTI.getIntrinsicInstrCost(
ICA, TargetTransformInfo::TCK_RecipThroughput) +
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
}
[[fallthrough]];
}
@@ -6721,7 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return 0;

// Detect reduction patterns
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;

// Certain instructions can be cheaper to vectorize if they have a constant
@@ -6886,7 +6882,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}

// Detect reduction patterns
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;

Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6911,7 +6907,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Call:
return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
return TTI.getInstructionCost(I, CostKind);
case Instruction::Alloca:
// We cannot easily widen alloca to a scalable alloca, as
// the result would need to be a vector of pointers.
@@ -7423,8 +7419,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
auto ReductionCost = CM.getReductionPatternCost(
I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
auto ReductionCost =
CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
if (!ReductionCost)
continue;

@@ -7482,7 +7478,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
@@ -7558,6 +7555,16 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
return {*FirstPlan.vectorFactors().begin(), 0, 0};

LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
<< (CM.CostKind == TTI::TCK_RecipThroughput
? "Reciprocal Throughput\n"
: CM.CostKind == TTI::TCK_Latency
? "Instruction Latency\n"
: CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
: CM.CostKind == TTI::TCK_SizeAndLatency
? "Code Size and Latency\n"
: "Unknown\n"));

ElementCount ScalarVF = ElementCount::getFixed(1);
assert(hasPlanWithVF(ScalarVF) &&
"More than a single plan/VF w/o any plan having scalar VF");
@@ -7611,7 +7618,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
assert((BestFactor.Width == LegacyVF.Width ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -9971,7 +9979,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
AddBranchWeights);
AddBranchWeights, CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10488,7 +10496,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
AddBranchWeights);
AddBranchWeights, CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);