Commit 721c122

[VPlan] Compute cost of more replicating loads/stores in ::computeCost.
Update VPReplicateRecipe::computeCost to compute costs of more replicating loads/stores. There are two cases that require extra checks to match the legacy cost model:

1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost; the SCEV computations will be added as a follow-up.

2. If a load is used as part of the address of another load or store, the legacy cost model skips the scalarization overhead. Those cases are currently handled by an isUsedByLoadStoreAddress helper.

Note that getScalarizationOverhead also needs updating: when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we cannot check for replicating recipes in order to skip their cost, except for other loads. This can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed.
1 parent add9079 commit 721c122
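To make the two cases above concrete, here is a minimal sketch of loop shapes that hit them (illustrative C++, not part of the patch; function and variable names are invented):

// Case 1: the address is based on an induction variable. The legacy model
// passes the pointer's SCEV to getAddressComputationCost, so the VPlan-based
// model still falls back to the legacy cost here.
int sumStrided(const int *A, long N, long Stride) {
  int Sum = 0;
  for (long I = 0; I < N; ++I)
    Sum += A[I * Stride]; // GEP index is an induction
  return Sum;
}

// Case 2: a replicated load feeds the address of another load, so the legacy
// model skips its scalarization overhead.
int sumIndirect(int *const *Ptrs, long N) {
  int Sum = 0;
  for (long I = 0; I < N; ++I)
    Sum += *Ptrs[I]; // loaded pointer is itself used as an address
  return Sum;
}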

File tree: 4 files changed, +131 -28 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 10 additions & 6 deletions

@@ -3925,7 +3925,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;

-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4182,7 +4183,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {

     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6871,7 +6873,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

   // Now compute and add the VPlan-based cost.
@@ -7082,7 +7084,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8704,7 +8707,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -10043,7 +10047,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind);
+                          CM.CostKind, *CM.PSE.getSE());
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.cpp
Lines changed: 8 additions & 3 deletions

@@ -1778,8 +1778,10 @@ VPCostContext::getOperandInfo(VPValue *V) const {
   return TTI::getOperandInfo(V->getLiveInIRValue());
 }

-InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+InstructionCost VPCostContext::getScalarizationOverhead(
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
@@ -1799,7 +1801,10 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
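To restate the new skip rule compactly, here is a standalone sketch using plain stand-ins for the VPlan recipe types (illustrative only; the struct and function are invented):

// An operand contributes insert/extract overhead only if none of these hold.
struct OperandInfo {
  bool IsLiveIn;          // live-ins are broadcast, not extracted
  bool IsReplicating;     // VPReplicateRecipe or VPPredInstPHIRecipe
  bool IsReplicatingLoad; // a VPReplicateRecipe load
};

static bool skipOperand(const OperandInfo &Op, bool AlwaysIncludeReplicatingR) {
  if (Op.IsLiveIn)
    return true;
  // Default behaviour: replicating operands are already scalar, so they are
  // skipped unless the caller forces their inclusion.
  if (!AlwaysIncludeReplicatingR && Op.IsReplicating)
    return true;
  // Replicating loads are skipped even when inclusion is forced.
  if (Op.IsReplicatingLoad)
    return true;
  return false;
}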

llvm/lib/Transforms/Vectorize/VPlanHelpers.h
Lines changed: 10 additions & 6 deletions

@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;

   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}

   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {

   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };

 /// This class can be used to assign names to VPValues. For VPValues without
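A hypothetical caller-side view of the new parameter (fragment only; Ctx, ResultTy, Ops, and VF are assumed to be in scope):

// Default: replicating operands are assumed scalar and skipped.
InstructionCost Default = Ctx.getScalarizationOverhead(ResultTy, Ops, VF);
// From a replicating memory recipe: include replicating operands, mirroring
// the legacy model, where scalars have not been collected yet.
InstructionCost ForMemOp = Ctx.getScalarizationOverhead(
    ResultTy, Ops, VF, /*AlwaysIncludeReplicatingR=*/true);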

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Lines changed: 103 additions & 13 deletions

@@ -3075,6 +3075,63 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }

+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  unsigned NumOperands = PtrR->getNumOperands();
+  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
+    VPValue *Opd = PtrR->getOperand(Idx);
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, Cur->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3182,21 +3239,54 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    if (getParent()->getParent() && getParent()->getParent()->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(getOperand(!IsLoad)))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(getParent()->getPlan()->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. The
+    // scalarization overhead is only skipped when the target does not prefer
+    // vectorized addressing and the loaded value is used as part of an
+    // address of another load or store.
+    if (Ctx.TTI.prefersVectorizedAddressing() ||
+        !isUsedByLoadStoreAddress(this)) {
+      if (!(IsLoad && !Ctx.TTI.prefersVectorizedAddressing()) &&
+          !(!IsLoad && Ctx.TTI.supportsEfficientVectorElementLoadStore()))
+        append_range(OpsToScalarize, operands());
+
+      if (!Ctx.TTI.supportsEfficientVectorElementLoadStore())
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
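As a sanity check on the return expression above, here is a back-of-the-envelope composition as a runnable snippet (plain ints instead of InstructionCost; the numbers are invented, not from any target):

#include <cassert>

// Mirrors: Total = (ScalarMemOpCost + AddrCost) * VF + ScalarizationOverhead.
static int replicatedMemOpCost(int ScalarMemOpCost, int AddrCost, int VF,
                               int ScalarizationOverhead) {
  int ScalarCost = ScalarMemOpCost + AddrCost;
  return ScalarCost * VF + ScalarizationOverhead;
}

int main() {
  // E.g. 4 (memory op) + 1 (address) per lane, 4 lanes, plus 3 to insert the
  // scalar results into a vector: (4 + 1) * 4 + 3 == 23.
  assert(replicatedMemOpCost(4, 1, 4, 3) == 23);
  return 0;
}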
