
Commit f6fc757

fhahn authored and mahesh-attarde committed
[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (llvm#160053)
Update VPReplicateRecipe::computeCost to compute the cost of more replicating loads/stores. Two cases require extra checks to match the legacy cost model:

1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost; SCEV computations will be added as a follow-up.
2. If a load is used as part of the address of another load or store, the legacy cost model skips the scalarization overhead. Those cases are currently handled by the isUsedByLoadStoreAddress helper.

Note that getScalarizationOverhead also needs updating: when the legacy cost model computes the scalarization overhead, scalars have not yet been collected, so we cannot check for replicating recipes in order to skip their cost, except for other loads. This can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed.

PR: llvm#160053
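For illustration, the two cases roughly correspond to source patterns like the ones sketched below. This is a minimal, hypothetical example: the function and variable names are made up and are not part of the patch, and whether the accesses are actually replicated depends on the target.

// Hypothetical loops illustrating the two special cases; names are
// illustrative only and do not appear in the patch.

struct Pair { int Key; int Val; };

// Case 1: the address of the (likely scalarized) load is a GEP indexed by the
// loop induction variable, so the legacy cost model passes its SCEV to
// getAddressComputationCost and the VPlan cost model still falls back.
int sumKeys(const Pair *P, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += P[I].Key; // strided access, may be replicated per lane
  return Sum;
}

// Case 2: assuming the gather Idx[C[I]] is scalarized, its result is used as
// part of the address of another load, so the legacy cost model skips its
// scalarization overhead.
int sumIndirect(const int *A, const int *Idx, const int *C, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[Idx[C[I]]]; // loaded value feeds another load's address
  return Sum;
}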
1 parent 8f5ffbf commit f6fc757

4 files changed (+130, -27 lines)

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 6 deletions
@@ -3902,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
       if (VF.isScalar())
         continue;

-      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                            *CM.PSE.getSE());
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4159,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

       // Add on other costs that are modelled in VPlan, but not in the legacy
       // cost model.
-      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                            *CM.PSE.getSE());
       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
       assert(VectorRegion && "Expected to have a vector region!");
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6834,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

   // Now compute and add the VPlan-based cost.
@@ -7067,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8597,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -10054,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind);
+                          CM.CostKind, *CM.PSE.getSE());
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 2 deletions
@@ -1750,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }

 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;

@@ -1770,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 10 additions & 6 deletions
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;

   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}

   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {

   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };

 /// This class can be used to assign names to VPValues. For VPValues without

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 105 additions & 13 deletions
@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }

+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
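As a rough worked example of the new Load/Store path (purely illustrative TTI numbers, not taken from any target): for a replicating load with VF = 4, assume getMemoryOpCost = 1, getAddressComputationCost = 0, prefersVectorizedAddressing() = false, supportsEfficientVectorElementLoadStore() = false, and a result that does not feed another load/store address. Then ScalarCost = 1 + 0 = 1, OpsToScalarize stays empty for a load, ResultTy is the loaded scalar type, and the returned cost is 1 * 4 plus the overhead of inserting the four scalar results into a vector (4, at one per insert), i.e. 8 in total.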
