
Commit 63d3668

Reapply "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (llvm#160053)" (llvm#162157)
This reverts commits f80c0ba and 94eade6, recommitting with a small fix for targets using prefersVectorizedAddressing.

Original message:

Update VPReplicateRecipe::computeCost to compute the cost of more replicating loads/stores. There are 2 cases that require extra checks to match the legacy cost model:

1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost; SCEV computations will be added as a follow-up.
2. If a load is used as part of the address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by the isUsedByLoadStoreAddress helper.

Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't check for replicating recipes to skip their cost, except for other loads. This again can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed.

PR: llvm#160053

(cherry picked from commit 74af578)
1 parent 82dfa00 commit 63d3668
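For illustration, below is a minimal IR sketch (not part of the patch; the function and value names are hypothetical) of the load-feeds-address shape that case 2 and the isUsedByLoadStoreAddress helper are concerned with: a replicated load whose only use is forming the address of another memory access. The new ARM test further down exercises the same shape, but with a more involved address computation so it is not also caught by the SCEV fallback of case 1.

; Hypothetical sketch, not from the patch: %off is loaded and only used to
; compute the address of the following store, so its scalarization overhead
; is skipped to match the legacy cost model.
define void @load_feeds_address(ptr %a, ptr %b, i32 %n) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %gep.src = getelementptr i8, ptr %a, i32 %iv
  %off = load i8, ptr %gep.src, align 1
  %off.ext = zext i8 %off to i32
  %gep.dst = getelementptr i32, ptr %b, i32 %off.ext
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add i32 %iv, 1
  %ec = icmp eq i32 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}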

File tree

5 files changed: +332, -21 lines


llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 7 additions & 2 deletions
@@ -1694,7 +1694,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 }
 
 InstructionCost VPCostContext::getScalarizationOverhead(
-    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1714,7 +1715,11 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   SmallPtrSet<const VPValue *, 4> UniqueOperands;
   SmallVector<Type *> Tys;
   for (auto *Op : Operands) {
-    if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR &&
+         isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
         !UniqueOperands.insert(Op).second)
       continue;
     Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 6 additions & 4 deletions
@@ -377,10 +377,12 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef<const VPValue *> Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 109 additions & 15 deletions
@@ -41,6 +41,7 @@
 #include <cassert>
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 using VectorParts = SmallVector<Value *, 2>;
 
@@ -308,7 +309,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();
 
   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -1819,7 +1819,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
 
   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -2969,6 +2968,62 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3074,21 +3129,60 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
  case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    bool UsedByLoadStoreAddress =
+        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+    InstructionCost ScalarCost =
+        ScalarMemOpCost +
+        Ctx.TTI.getAddressComputationCost(
+            PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, nullptr);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    if (!UsedByLoadStoreAddress) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target triple = "armv7-unknown-linux-gnueabihf"
+
+define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) {
+; CHECK-LABEL: define void @replicating_load_used_by_other_load(
+; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[IV]], 1
+; CHECK-NEXT:    [[AND_1:%.*]] = and i32 [[IV]], 1
+; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 [[IV]], 2
+; CHECK-NEXT:    [[SHL_2:%.*]] = shl i32 [[IV]], 1
+; CHECK-NEXT:    [[AND_2:%.*]] = and i32 [[SHL_2]], 2
+; CHECK-NEXT:    [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]]
+; CHECK-NEXT:    [[OR_2:%.*]] = or i32 [[OR_1]], [[SHL_1]]
+; CHECK-NEXT:    [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]]
+; CHECK-NEXT:    [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]]
+; CHECK-NEXT:    [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1
+; CHECK-NEXT:    [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]]
+; CHECK-NEXT:    [[AND_3:%.*]] = and i32 [[XOR_3]], 1
+; CHECK-NEXT:    [[AND_4:%.*]] = and i32 [[IV]], 2147483646
+; CHECK-NEXT:    [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]]
+; CHECK-NEXT:    [[AND_5:%.*]] = and i32 [[IV]], 254
+; CHECK-NEXT:    [[SHL_3:%.*]] = shl i32 [[OR_3]], 1
+; CHECK-NEXT:    [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2
+; CHECK-NEXT:    [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]]
+; CHECK-NEXT:    [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]]
+; CHECK-NEXT:    [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]]
+; CHECK-NEXT:    [[AND_6:%.*]] = and i32 [[XOR_6]], 255
+; CHECK-NEXT:    [[XOR_7:%.*]] = xor i32 [[AND_6]], 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]]
+; CHECK-NEXT:    [[LD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[LD]] to i32
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]]
+; CHECK-NEXT:    store i32 0, ptr [[GEP_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100
+; CHECK-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ]
+  %shr = lshr i32 %iv, 1
+  %and.1 = and i32 %iv, 1
+  %shl.1 = shl i32 %iv, 2
+  %shl.2 = shl i32 %iv, 1
+  %and.2 = and i32 %shl.2, 2
+  %or.1 = or i32 %and.2, %and.1
+  %or.2 = or i32 %or.1, %shl.1
+  %xor.1 = xor i32 %b, %or.2
+  %xor.2 = xor i32 %xor.1, %arg
+  %shr.2 = lshr i32 %shl.1, 1
+  %xor.3 = xor i32 %shr, %arg
+  %and.3 = and i32 %xor.3, 1
+  %and.4 = and i32 %iv, 2147483646
+  %or.3 = or i32 %and.3, %and.4
+  %and.5 = and i32 %iv, 254
+  %shl.3 = shl i32 %or.3, 1
+  %xor.4 = xor i32 %shl.3, 2
+  %or.4 = or i32 %and.5, %xor.4
+  %xor.5 = xor i32 %shr.2, %or.4
+  %xor.6 = xor i32 %xor.5, %xor.2
+  %and.6 = and i32 %xor.6, 255
+  %xor.7 = xor i32 %and.6, 1
+  %gep = getelementptr i8, ptr %a, i32 %xor.7
+  %ld = load i8, ptr %gep, align 1
+  %zext = zext i8 %ld to i32
+  %gep.2 = getelementptr i32, ptr null, i32 %zext
+  store i32 0, ptr %gep.2, align 4
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp eq i32 %iv.next, 100
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
