
Commit 8f2466b

Reapply "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053)" (#161724)
This reverts commit f61be43. Recommit with a small fix: handle scalarization overhead consistently with the legacy cost model when a load is used directly as an operand of another memory operation, which fixes #161404.

Original message:

Update VPReplicateRecipe::computeCost to compute costs of more replicating loads/stores. There are 2 cases that require extra checks to match the legacy cost model:

1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost; SCEV computations will be added as a follow-up.

2. If a load is used as part of the address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by an isUsedByLoadStoreAddress helper.

Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't check for replicating recipes to skip their cost, except for other loads. This can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed.

PR: #160053
1 parent f4784fd commit 8f2466b
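For context on case 2 in the message above, here is a minimal, hypothetical C++ loop of the affected shape (not taken from the commit): the scalar, replicated load of Idx[I] is used only to form the address of the load from Src, so no insert/extract overhead should be charged for it, mirroring the legacy cost model.

// Hypothetical example: the loaded value Idx[I] only feeds the address of the
// second load, so the scalarization overhead for its result is skipped by both
// the legacy and the VPlan-based cost model.
void gather(float *Dst, const float *Src, const int *Idx, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[Idx[I]];
}

Case 1 roughly corresponds to addresses like Dst[I], where the GEP index is the induction variable and the legacy model passes the pointer's SCEV to getAddressComputationCost.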

File tree

4 files changed (+134, -29 lines)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 6 deletions
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8621,7 +8624,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -10075,7 +10079,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind);
+                          CM.CostKind, *CM.PSE.getSE());
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 7 additions & 2 deletions
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 10 additions & 6 deletions
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 107 additions & 15 deletions
@@ -40,6 +40,7 @@
 #include <cassert>
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 using VectorParts = SmallVector<Value *, 2>;
 
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();
 
   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3218,21 +3273,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 
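As a rough, purely illustrative sketch of how the new load/store case composes its result (the constants below are assumptions, not real TTI values or LLVM API calls): a replicated access is charged one scalar memory op plus one address computation per lane, plus scalarization overhead unless the loaded value only feeds another address.

#include <iostream>

// Illustrative arithmetic only; all costs are assumed, not queried from TTI.
int main() {
  unsigned VF = 4;                      // fixed vectorization factor (assumption)
  unsigned ScalarMemOpCost = 1;         // assumed cost of one scalar load/store
  unsigned AddrCompCost = 1;            // assumed scalar address computation cost
  unsigned InsertExtractCost = 1;       // assumed per-element packing cost
  bool OnlyFeedsAnotherAddress = false; // models the isUsedByLoadStoreAddress case

  unsigned ScalarCost = ScalarMemOpCost + AddrCompCost;
  unsigned Overhead = OnlyFeedsAnotherAddress ? 0 : VF * InsertExtractCost;
  std::cout << "replicated access cost ~= " << ScalarCost * VF + Overhead << "\n";
  return 0;
}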