
Commit 770cb16

Reapply "Reapply "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (llvm#160053)" (llvm#161724)"
This reverts commit f80c0ba.
1 parent 5d3b605 commit 770cb16
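For context: a "replicating" load or store is one whose address is not consecutive across vector lanes, so the vectorizer emits VF scalar copies of the instruction instead of a single wide memory operation. An illustrative loop (names are hypothetical, not taken from the patch):

    // Illustrative only: b[i] makes the address of a[b[i]] non-consecutive,
    // so at VF=4 the load is replicated into 4 scalar loads plus the cost of
    // packing the 4 results into a vector. This patch prices that directly
    // in VPReplicateRecipe::computeCost rather than the legacy cost model.
    for (int i = 0; i < n; ++i)
      sum += a[b[i]];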

4 files changed: +134, -29 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 6 deletions
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
       if (VF.isScalar())
         continue;
 
-      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                            *CM.PSE.getSE());
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
       // Add on other costs that are modelled in VPlan, but not in the legacy
       // cost model.
-      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                            *CM.PSE.getSE());
       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
       assert(VectorRegion && "Expected to have a vector region!");
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind);
+                          CM.CostKind, *CM.PSE.getSE());
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,
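All six construction sites above follow the same pattern: the PredicatedScalarEvolution held by the cost model (or planner) wraps the ScalarEvolution instance the new address-cost query needs, so it is unwrapped with getSE(). A minimal sketch of the pattern, assuming the surrounding LoopVectorize types:

    // CM.PSE is the PredicatedScalarEvolution held by the cost model;
    // getSE() returns the underlying ScalarEvolution, which VPCostContext
    // now stores by reference for getAddressComputationCost queries.
    ScalarEvolution &SE = *CM.PSE.getSE();
    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, SE);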

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 7 additions & 2 deletions
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
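The effect of the new flag: previously, any operand defined by a VPReplicateRecipe or VPPredInstPHIRecipe was assumed to already be available as scalars and was skipped when summing extraction overhead. With AlwaysIncludeReplicatingR set, such operands are counted, except replicated loads, whose results always exist per lane. A hedged sketch of a call (VoidTy, StoredVal, and Addr are hypothetical):

    // With the flag off, a stored value defined by a replicating recipe adds
    // no extraction cost; with it on, its vectorized type is pushed into Tys
    // and priced by the type-based TTI scalarization-overhead query.
    InstructionCost Overhead = Ctx.getScalarizationOverhead(
        VoidTy, {StoredVal, Addr}, VF, /*AlwaysIncludeReplicatingR=*/true);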

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 10 additions & 6 deletions
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
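Since the flag defaults to false, existing call sites keep their behavior unchanged; only the new replicated load/store costing opts in (ResultTy and Ops stand in for any result type and operand list):

    InstructionCost A =
        Ctx.getScalarizationOverhead(ResultTy, Ops, VF); // old behavior
    InstructionCost B = Ctx.getScalarizationOverhead(
        ResultTy, Ops, VF, /*AlwaysIncludeReplicatingR=*/true); // opt in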

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 107 additions & 15 deletions
@@ -40,6 +40,7 @@
 #include <cassert>
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 using VectorParts = SmallVector<Value *, 2>;
 
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();
 
   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
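The two helpers above gate the new costing path: shouldUseAddressAccessSCEV falls back to the legacy model for GEPs whose indices are loop invariant or inductions (the legacy model prices those addresses via a SCEV expression, which is not handled here yet), and isUsedByLoadStoreAddress walks the transitive users of a value to check whether it feeds the address of another load or store. An illustrative pattern for the latter (hypothetical names):

    // Illustrative only: the first load's result is used as the address of
    // the second load. On targets that prefer scalar addressing, its lanes
    // can stay scalar, so no insert/extract packing overhead is charged.
    for (int i = 0; i < n; ++i)
      sum += *ptrs[i]; // load ptrs[i], then load through the loaded pointer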
@@ -3218,21 +3273,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 
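Putting the new Load/Store path together: the cost of a replicated memory operation is the scalar memory-op cost plus the scalar address-computation cost, multiplied by the fixed VF (scalable VFs return invalid above), plus any packing overhead. A worked example with illustrative unit costs, not taken from any particular target:

    // VF = 4, getMemoryOpCost = 1, getAddressComputationCost = 1:
    //   ScalarCost                      = 1 + 1 = 2
    //   ScalarCost * VF.getFixedValue() = 8
    //   + getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true),
    //     e.g. 4 inserts to assemble the loaded lanes into a vector, plus
    //     extracts for each scalarized operand, when the target neither
    //     prefers vectorized addressing nor supports efficient vector
    //     element loads/stores.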