
Commit 74af578

Reapply "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053)" (#162157)
This reverts commit f80c0ba and 94eade6. Recommit with a small fix for targets using prefersVectorizedAddressing.

Original message:

Update VPReplicateRecipe::computeCost to compute the costs of more replicating loads/stores. There are two cases that require extra checks to match the legacy cost model:

1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost; SCEV computations will be added as a follow-up.

2. If a load is used as part of the address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by the isUsedByLoadStoreAddress helper.

Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't check for replicating recipes in order to skip their cost, except for other loads. This can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed.

PR: #160053
1 parent c830c84 commit 74af578
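
The decision the message describes can be sketched outside LLVM as a tiny standalone model. Everything below is an illustrative assumption: the struct, the helper name, and the unit costs are made up for exposition and are not LLVM or TTI APIs; the real logic is in VPReplicateRecipe::computeCost in the VPlanRecipes.cpp diff further down.

#include <cstdio>
#include <optional>

// Simplified, self-contained model of the two special cases and the
// scalarization-overhead decision. All names and costs are assumptions.
struct ReplicatedMemOp {
  bool IsLoad;                // load (true) or store (false)
  bool AddressNeedsSCEV;      // case 1: pointer based on an induction
  bool FeedsAnotherAddress;   // case 2: result only feeds other addresses
  unsigned ScalarMemCost;     // assumed per-lane memory-op cost
  unsigned ScalarAddrCost;    // assumed per-lane address-computation cost
  unsigned PerLaneInsertCost; // assumed cost to build the vector result
};

// Returns std::nullopt where the real code would fall back to the legacy
// cost model; otherwise returns a rough cost for VF scalarized lanes.
std::optional<unsigned> replicatedMemOpCost(const ReplicatedMemOp &Op,
                                            unsigned VF,
                                            bool PrefersVectorizedAddressing) {
  if (Op.AddressNeedsSCEV)
    return std::nullopt; // legacy model passes a SCEV; defer to it for now.

  unsigned Cost = (Op.ScalarMemCost + Op.ScalarAddrCost) * VF;

  // On targets that do not prefer vectorized addressing, a load whose result
  // is only used to form other addresses never needs to be packed into a
  // vector, so no scalarization overhead is added.
  bool SkipOverhead = !PrefersVectorizedAddressing && Op.IsLoad &&
                      Op.FeedsAnotherAddress;
  if (!SkipOverhead)
    Cost += Op.PerLaneInsertCost * VF;
  return Cost;
}

int main() {
  ReplicatedMemOp Load{/*IsLoad=*/true, /*AddressNeedsSCEV=*/false,
                       /*FeedsAnotherAddress=*/true, /*ScalarMemCost=*/1,
                       /*ScalarAddrCost=*/1, /*PerLaneInsertCost=*/1};
  if (auto C = replicatedMemOpCost(Load, /*VF=*/4,
                                   /*PrefersVectorizedAddressing=*/false))
    std::printf("cost = %u\n", *C); // prints "cost = 8": overhead skipped
  else
    std::puts("fall back to the legacy cost model");
  return 0;
}

With these assumed unit costs the example load costs 2 per lane at VF = 4 and no packing overhead, because its only use is as the address of another memory access.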

6 files changed: +346 −29 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 6 deletions
@@ -3903,7 +3903,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4160,7 +4161,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6852,7 +6854,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7085,7 +7087,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8418,7 +8421,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
   }
@@ -9874,7 +9878,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool ForceVectorization =
       Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
-                        CM.CostKind);
+                        CM.CostKind, *CM.PSE.getSE());
   if (!ForceVectorization &&
       !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
                                    LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 7 additions & 2 deletions
@@ -1772,7 +1772,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1792,7 +1793,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 10 additions & 6 deletions
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 109 additions & 15 deletions
@@ -40,6 +40,7 @@
 #include <cassert>
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 using VectorParts = SmallVector<Value *, 2>;
 
@@ -303,7 +304,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();
 
   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -1963,7 +1963,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -3111,6 +3110,62 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3218,21 +3273,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
  case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    bool UsedByLoadStoreAddress =
+        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
+                              nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    if (!UsedByLoadStoreAddress) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 
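
To make the final return expression in the Load/Store case above concrete, here is a rough worked calculation with assumed unit costs. The numbers are purely illustrative; real values come from the target's TTI hooks, and operand extraction overhead is ignored for brevity.

// Illustrative arithmetic only; all cost values below are assumptions.
constexpr unsigned VF            = 4; // fixed vectorization factor
constexpr unsigned ScalarMemOp   = 1; // assumed per-lane memory-op cost
constexpr unsigned AddrCompute   = 1; // assumed address-computation cost
constexpr unsigned ScalarCost    = ScalarMemOp + AddrCompute;  // 2
constexpr unsigned InsertPerLane = 1; // assumed cost to insert one lane
constexpr unsigned Overhead      = InsertPerLane * VF;         // 4, only if
                                                               // the result
                                                               // must be packed
constexpr unsigned Total         = ScalarCost * VF + Overhead; // 2*4 + 4 = 12
static_assert(Total == 12, "ScalarCost * VF + scalarization overhead");

When the result only feeds other addresses (and the target does not prefer vectorized addressing), the Overhead term is dropped and the total would be 8 instead.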

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target triple = "armv7-unknown-linux-gnueabihf"
+
+define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) {
+; CHECK-LABEL: define void @replicating_load_used_by_other_load(
+; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[IV]], 1
+; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[IV]], 1
+; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[IV]], 2
+; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 [[IV]], 1
+; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], 2
+; CHECK-NEXT: [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i32 [[OR_1]], [[SHL_1]]
+; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]]
+; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]]
+; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1
+; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]]
+; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[XOR_3]], 1
+; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[IV]], 2147483646
+; CHECK-NEXT: [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]]
+; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[IV]], 254
+; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 [[OR_3]], 1
+; CHECK-NEXT: [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2
+; CHECK-NEXT: [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]]
+; CHECK-NEXT: [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]]
+; CHECK-NEXT: [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]]
+; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[XOR_6]], 255
+; CHECK-NEXT: [[XOR_7:%.*]] = xor i32 [[AND_6]], 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]]
+; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD]] to i32
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]]
+; CHECK-NEXT: store i32 0, ptr [[GEP_2]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100
+; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ]
+  %shr = lshr i32 %iv, 1
+  %and.1 = and i32 %iv, 1
+  %shl.1 = shl i32 %iv, 2
+  %shl.2 = shl i32 %iv, 1
+  %and.2 = and i32 %shl.2, 2
+  %or.1 = or i32 %and.2, %and.1
+  %or.2 = or i32 %or.1, %shl.1
+  %xor.1 = xor i32 %b, %or.2
+  %xor.2 = xor i32 %xor.1, %arg
+  %shr.2 = lshr i32 %shl.1, 1
+  %xor.3 = xor i32 %shr, %arg
+  %and.3 = and i32 %xor.3, 1
+  %and.4 = and i32 %iv, 2147483646
+  %or.3 = or i32 %and.3, %and.4
+  %and.5 = and i32 %iv, 254
+  %shl.3 = shl i32 %or.3, 1
+  %xor.4 = xor i32 %shl.3, 2
+  %or.4 = or i32 %and.5, %xor.4
+  %xor.5 = xor i32 %shr.2, %or.4
+  %xor.6 = xor i32 %xor.5, %xor.2
+  %and.6 = and i32 %xor.6, 255
+  %xor.7 = xor i32 %and.6, 1
+  %gep = getelementptr i8, ptr %a, i32 %xor.7
+  %ld = load i8, ptr %gep, align 1
+  %zext = zext i8 %ld to i32
+  %gep.2 = getelementptr i32, ptr null, i32 %zext
+  store i32 0, ptr %gep.2, align 4
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp eq i32 %iv.next, 100
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
