
Commit f61be43

Revert "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053)"
This reverts commit b4be7ec. See #161404 for a crash exposed by the change. Revert while I investigate.
1 parent a099c91 commit f61be43

File tree

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlanHelpers.h
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

4 files changed: +27 -130 lines changed

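The thread running through all four files is that VPCostContext loses the ScalarEvolution reference the reverted change had added. As a condensed before/after of a typical construction site (a sketch distilled from the hunks below, not standalone compilable code):

    // Before (b4be7ec): each construction site passed ScalarEvolution so
    // VPReplicateRecipe::computeCost could price address computations via SCEV.
    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
                          *CM.PSE.getSE());

    // After this revert: the trailing ScalarEvolution argument is gone.
    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);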

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 10 deletions
@@ -3903,8 +3903,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4161,8 +4160,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6837,7 +6835,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7070,8 +7068,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
-                        *CM.PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -10058,7 +10054,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                          CM.CostKind, *CM.PSE.getSE());
+                          CM.CostKind);
     if (!ForceVectorization &&
         !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                      LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 2 additions & 5 deletions
@@ -1750,8 +1750,7 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
-    bool AlwaysIncludeReplicatingR) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
   if (VF.isScalar())
     return 0;
 
@@ -1771,9 +1770,7 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() ||
-        (!AlwaysIncludeReplicatingR &&
-         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 6 additions & 10 deletions
@@ -349,14 +349,12 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
-  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind,
-                ScalarEvolution &SE)
+                TargetTransformInfo::TargetCostKind CostKind)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind), SE(SE) {}
+        CostKind(CostKind) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -376,12 +374,10 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
-  /// is true, always compute the cost of scalarizing replicating operands.
-  InstructionCost
-  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
-                           ElementCount VF,
-                           bool AlwaysIncludeReplicatingR = false);
+  /// type-based getScalarizationOverhead API.
+  InstructionCost getScalarizationOverhead(Type *ResultTy,
+                                           ArrayRef<const VPValue *> Operands,
+                                           ElementCount VF);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
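Callers correspondingly drop back to the three-parameter overload. A hypothetical call site (Ctx, ResultTy, and OpsToScalarize are illustrative names, not taken from this hunk):

    // Estimate insert/extract overhead for scalarizing Operands at VF.
    // With AlwaysIncludeReplicatingR gone, replicating operands
    // (VPReplicateRecipe, VPPredInstPHIRecipe) are always skipped again.
    InstructionCost Overhead =
        Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF);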

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 105 deletions
@@ -3098,61 +3098,6 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
-/// Returns true if \p Ptr is a pointer computation for which the legacy cost
-/// model computes a SCEV expression when computing the address cost.
-static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
-  auto *PtrR = Ptr->getDefiningRecipe();
-  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
-                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
-                      Instruction::GetElementPtr) ||
-                 isa<VPWidenGEPRecipe>(PtrR)))
-    return false;
-
-  // We are looking for a GEP where all indices are either loop invariant or
-  // inductions.
-  for (VPValue *Opd : drop_begin(PtrR->operands())) {
-    if (!Opd->isDefinedOutsideLoopRegions() &&
-        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
-      return false;
-  }
-
-  return true;
-}
-
-/// Returns true if \p V is used as part of the address of another load or
-/// store.
-static bool isUsedByLoadStoreAddress(const VPUser *V) {
-  SmallPtrSet<const VPUser *, 4> Seen;
-  SmallVector<const VPUser *> WorkList = {V};
-
-  while (!WorkList.empty()) {
-    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
-    if (!Cur || !Seen.insert(Cur).second)
-      continue;
-
-    for (VPUser *U : Cur->users()) {
-      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
-        if (InterleaveR->getAddr() == Cur)
-          return true;
-      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
-        if (RepR->getOpcode() == Instruction::Load &&
-            RepR->getOperand(0) == Cur)
-          return true;
-        if (RepR->getOpcode() == Instruction::Store &&
-            RepR->getOperand(1) == Cur)
-          return true;
-      }
-      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
-        if (MemR->getAddr() == Cur && MemR->isConsecutive())
-          return true;
-      }
-    }
-
-    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
-  }
-  return false;
-}
-
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3260,58 +3205,21 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (VF.isScalable() && !isSingleScalar())
-      return InstructionCost::getInvalid();
-
+    if (isSingleScalar()) {
+      bool IsLoad = UI->getOpcode() == Instruction::Load;
+      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+      const Align Alignment = getLoadStoreAlignment(UI);
+      unsigned AS = getLoadStoreAddressSpace(UI);
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+    }
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    const VPRegionBlock *ParentRegion = getParent()->getParent();
-    if (ParentRegion && ParentRegion->isReplicator())
-      break;
-
-    bool IsLoad = UI->getOpcode() == Instruction::Load;
-    const VPValue *PtrOp = getOperand(!IsLoad);
-    // TODO: Handle cases where we need to pass a SCEV to
-    // getAddressComputationCost.
-    if (shouldUseAddressAccessSCEV(PtrOp))
-      break;
-
-    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
-    const Align Alignment = getLoadStoreAlignment(UI);
-    unsigned AS = getLoadStoreAddressSpace(UI);
-    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
-
-    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
-
-    InstructionCost ScalarCost =
-        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
-    if (isSingleScalar())
-      return ScalarCost;
-
-    SmallVector<const VPValue *> OpsToScalarize;
-    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
-    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
-    // don't assign scalarization overhead in general, if the target prefers
-    // vectorized addressing or the loaded value is used as part of an address
-    // of another load or store.
-    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
-    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
-      bool EfficientVectorLoadStore =
-          Ctx.TTI.supportsEfficientVectorElementLoadStore();
-      if (!(IsLoad && !PreferVectorizedAddressing) &&
-          !(!IsLoad && EfficientVectorLoadStore))
-        append_range(OpsToScalarize, operands());
-
-      if (!EfficientVectorLoadStore)
-        ResultTy = Ctx.Types.inferScalarType(this);
-    }
-
-    return (ScalarCost * VF.getFixedValue()) +
-           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+    break;
   }
   }
 
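For reference, the replicating (non-single-scalar) path deleted above combined a per-lane scalar cost with packing overhead. A simplified restatement using the names from the removed lines, kept only to document what the revert takes out:

    // Removed costing, roughly: (scalar mem op + SCEV-aware address cost)
    // per lane, times the fixed VF, plus scalarization overhead whenever
    // operands or results must be packed or unpacked.
    InstructionCost ScalarCost =
        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
    return (ScalarCost * VF.getFixedValue()) +
           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF,
                                        /*AlwaysIncludeReplicatingR=*/true);

After the revert, that case simply breaks out of the switch and is priced by the legacy cost model fallback instead.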