Skip to content

Commit b2d12d6

Browse files
authored
[VPlan] Extend getSCEVForVPV, use to compute VPReplicateRecipe cost. (#161276)
Update getSCEVExprForVPValue to handle more complex expressions, so that it can be used in VPReplicateRecipe::computeCost. In particular, it supports constructing SCEV expressions for GetElementPtr VPReplicateRecipes with operands that are VPScalarIVStepsRecipe, VPDerivedIVRecipe and VPCanonicalIVPHIRecipe. If we hit a sub-expression we don't support yet, we return SCEVCouldNotCompute. Note that the SCEV expression is valid for VF = 1: we only support constructing AddRecs for VPCanonicalIVPHIRecipe, which is an AddRec starting at 0 and stepping by 1. The returned SCEV expressions could be converted to VF-specific ones by rewriting the AddRecs to ones with the appropriate step. Note that the logic for constructing SCEVs for GetElementPtr was directly ported from ScalarEvolution.cpp. Another thing to note is that we construct SCEV expressions purely by looking at the operation of the recipe and its translated operands, without accessing the underlying IR (the exception being getting the source element type for GEPs). PR: #161276
1 parent c46bfed commit b2d12d6

File tree

5 files changed

+74
-20
lines changed

5 files changed

+74
-20
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
39083908
continue;
39093909

39103910
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
3911-
*CM.PSE.getSE());
3911+
*CM.PSE.getSE(), OrigLoop);
39123912
precomputeCosts(*Plan, VF, CostCtx);
39133913
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
39143914
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
41664166
// Add on other costs that are modelled in VPlan, but not in the legacy
41674167
// cost model.
41684168
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
4169-
*CM.PSE.getSE());
4169+
*CM.PSE.getSE(), OrigLoop);
41704170
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
41714171
assert(VectorRegion && "Expected to have a vector region!");
41724172
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6876,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
68766876

68776877
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
68786878
ElementCount VF) const {
6879-
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
6879+
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
6880+
OrigLoop);
68806881
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
68816882

68826883
// Now compute and add the VPlan-based cost.
@@ -7110,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
71107111
// case, don't trigger the assertion, as the extra simplifications may cause a
71117112
// different VF to be picked by the VPlan-based cost model.
71127113
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
7113-
*CM.PSE.getSE());
7114+
*CM.PSE.getSE(), OrigLoop);
71147115
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
71157116
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
71167117
// with early exits and plans with additional VPlan simplifications. The
71177118
// legacy cost model doesn't properly model costs for such loops.
71187119
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7120+
!Legal->getLAI()->getSymbolicStrides().empty() ||
71197121
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
71207122
CostCtx, OrigLoop,
71217123
BestFactor.Width) ||
@@ -8441,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
84418443
// and mulacc-reduction are implemented.
84428444
if (!CM.foldTailWithEVL()) {
84438445
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
8444-
*CM.PSE.getSE());
8446+
*CM.PSE.getSE(), OrigLoop);
84458447
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
84468448
CostCtx, Range);
84478449
}
@@ -9911,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99119913
bool ForceVectorization =
99129914
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
99139915
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
9914-
CM.CostKind, *CM.PSE.getSE());
9916+
CM.CostKind, *CM.PSE.getSE(), L);
99159917
if (!ForceVectorization &&
99169918
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
99179919
LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,13 +350,14 @@ struct VPCostContext {
350350
SmallPtrSet<Instruction *, 8> SkipCostComputation;
351351
TargetTransformInfo::TargetCostKind CostKind;
352352
ScalarEvolution &SE;
353+
const Loop *L;
353354

354355
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
355356
const VPlan &Plan, LoopVectorizationCostModel &CM,
356357
TargetTransformInfo::TargetCostKind CostKind,
357-
ScalarEvolution &SE)
358+
ScalarEvolution &SE, const Loop *L)
358359
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
359-
CostKind(CostKind), SE(SE) {}
360+
CostKind(CostKind), SE(SE), L(L) {}
360361

361362
/// Return the cost for \p UI with \p VF using the legacy cost model as
362363
/// fallback until computing the cost of all recipes migrates to VPlan.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const {
31673167
});
31683168
}
31693169

3170-
/// Returns true if \p Ptr is a pointer computation for which the legacy cost
3171-
/// model computes a SCEV expression when computing the address cost.
3172-
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
3170+
/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3171+
/// which the legacy cost model computes a SCEV expression when computing the
3172+
/// address cost. Computing SCEVs for VPValues is incomplete and returns
3173+
/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3174+
/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3175+
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
3176+
const Loop *L) {
31733177
auto *PtrR = Ptr->getDefiningRecipe();
31743178
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
31753179
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
31763180
Instruction::GetElementPtr) ||
31773181
isa<VPWidenGEPRecipe>(PtrR) ||
31783182
match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
3179-
return false;
3183+
return nullptr;
31803184

31813185
// We are looking for a GEP where all indices are either loop invariant or
31823186
// inductions.
31833187
for (VPValue *Opd : drop_begin(PtrR->operands())) {
31843188
if (!Opd->isDefinedOutsideLoopRegions() &&
31853189
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
3186-
return false;
3190+
return nullptr;
31873191
}
31883192

3189-
return true;
3193+
return vputils::getSCEVExprForVPValue(Ptr, SE, L);
31903194
}
31913195

31923196
/// Returns true if \p V is used as part of the address of another load or
@@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
33543358

33553359
bool IsLoad = UI->getOpcode() == Instruction::Load;
33563360
const VPValue *PtrOp = getOperand(!IsLoad);
3357-
// TODO: Handle cases where we need to pass a SCEV to
3358-
// getAddressComputationCost.
3359-
if (shouldUseAddressAccessSCEV(PtrOp))
3361+
const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
3362+
if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
33603363
break;
33613364

33623365
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
@@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
33743377
InstructionCost ScalarCost =
33753378
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
33763379
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
3377-
nullptr, Ctx.CostKind);
3380+
PtrSCEV, Ctx.CostKind);
33783381
if (isSingleScalar())
33793382
return ScalarCost;
33803383

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
7575
B == Plan.getBackedgeTakenCount();
7676
}
7777

78-
const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
78+
const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
79+
ScalarEvolution &SE, const Loop *L) {
7980
if (V->isLiveIn()) {
8081
if (Value *LiveIn = V->getLiveInIRValue())
8182
return SE.getSCEV(LiveIn);
@@ -86,6 +87,52 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
8687
return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
8788
.Case<VPExpandSCEVRecipe>(
8889
[](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
90+
.Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) {
91+
if (!L)
92+
return SE.getCouldNotCompute();
93+
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
94+
return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
95+
SCEV::FlagAnyWrap);
96+
})
97+
.Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) {
98+
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
99+
const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L);
100+
const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L);
101+
if (any_of(ArrayRef({Start, IV, Scale}), IsaPred<SCEVCouldNotCompute>))
102+
return SE.getCouldNotCompute();
103+
104+
return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()),
105+
SE.getMulExpr(IV, SE.getTruncateOrSignExtend(
106+
Scale, IV->getType())));
107+
})
108+
.Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) {
109+
const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L);
110+
const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L);
111+
if (isa<SCEVCouldNotCompute>(IV) || isa<SCEVCouldNotCompute>(Step))
112+
return SE.getCouldNotCompute();
113+
return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()),
114+
Step);
115+
})
116+
.Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
117+
if (R->getOpcode() != Instruction::GetElementPtr)
118+
return SE.getCouldNotCompute();
119+
120+
const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
121+
if (isa<SCEVCouldNotCompute>(Base))
122+
return SE.getCouldNotCompute();
123+
124+
SmallVector<const SCEV *> IndexExprs;
125+
for (VPValue *Index : drop_begin(R->operands())) {
126+
const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
127+
if (isa<SCEVCouldNotCompute>(IndexExpr))
128+
return SE.getCouldNotCompute();
129+
IndexExprs.push_back(IndexExpr);
130+
}
131+
132+
Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
133+
->getSourceElementType();
134+
return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
135+
})
89136
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
90137
}
91138

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);
3737

3838
/// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
3939
/// SCEV expression could be constructed.
40-
const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
40+
const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
41+
const Loop *L = nullptr);
4142

4243
/// Returns true if \p VPV is a single scalar, either because it produces the
4344
/// same value for all lanes or only has its first lane used.

0 commit comments

Comments
 (0)