Skip to content

Commit 6613b8e

Browse files
committed
[VPlan] Extend getSCEVForVPV, use to compute VPReplicateRecipe cost. (llvm#161276)
Update getSCEVExprForVPValue to handle more complex expressions, to use it in VPReplicateRecipe::computeCost. In particular, it supports constructing SCEV expressions for GetElementPtr VPReplicateRecipes, with operands that are VPScalarIVStepsRecipe, VPDerivedIVRecipe and VPCanonicalIVRecipe. If we hit a sub-expression we don't support yet, we return SCEVCouldNotCompute. Note that the SCEV expression is valid for VF = 1: we only support constructing AddRecs for VPCanonicalIVRecipe, which is an AddRec starting at 0 and stepping by 1. The returned SCEV expressions could be converted to a VF-specific one, by rewriting the AddRecs to ones with the appropriate step. Note that the logic for constructing SCEVs for GetElementPtr was directly ported from ScalarEvolution.cpp. Another thing to note is that we construct SCEV expressions purely by looking at the operation of the recipe and its translated operands, w/o accessing the underlying IR (the exception being getting the source element type for GEPs). PR: llvm#161276
1 parent 308e1b0 commit 6613b8e

File tree

3 files changed

+43
-21
lines changed

3 files changed

+43
-21
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2968,26 +2968,31 @@ bool VPReplicateRecipe::shouldPack() const {
29682968
});
29692969
}
29702970

2971-
/// Returns true if \p Ptr is a pointer computation for which the legacy cost
2972-
/// model computes a SCEV expression when computing the address cost.
2973-
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
2971+
/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
2972+
/// which the legacy cost model computes a SCEV expression when computing the
2973+
/// address cost. Computing SCEVs for VPValues is incomplete and returns
2974+
/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
2975+
/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
2976+
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
2977+
const Loop *L) {
2978+
using namespace llvm::VPlanPatternMatch;
29742979
auto *PtrR = Ptr->getDefiningRecipe();
29752980
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
29762981
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
29772982
Instruction::GetElementPtr) ||
29782983
isa<VPWidenGEPRecipe>(PtrR) ||
29792984
match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
2980-
return false;
2985+
return nullptr;
29812986

29822987
// We are looking for a GEP where all indices are either loop invariant or
29832988
// inductions.
29842989
for (VPValue *Opd : drop_begin(PtrR->operands())) {
29852990
if (!Opd->isDefinedOutsideLoopRegions() &&
29862991
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
2987-
return false;
2992+
return nullptr;
29882993
}
29892994

2990-
return true;
2995+
return vputils::getSCEVExprForVPValue(Ptr, SE, L);
29912996
}
29922997

29932998
/// Returns true if \p V is used as part of the address of another load or
@@ -3001,6 +3006,16 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
30013006
if (!Cur || !Seen.insert(Cur).second)
30023007
continue;
30033008

3009+
auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3010+
// Skip blends that use V only through a compare by checking if any incoming
3011+
// value was already visited.
3012+
if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3013+
[&](unsigned I) {
3014+
return Seen.contains(
3015+
Blend->getIncomingValue(I)->getDefiningRecipe());
3016+
}))
3017+
continue;
3018+
30043019
for (VPUser *U : Cur->users()) {
30053020
if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
30063021
if (InterleaveR->getAddr() == Cur)
@@ -3019,7 +3034,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
30193034
}
30203035
}
30213036

3022-
append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
3037+
// The legacy cost model only supports scalarization loads/stores with phi
3038+
// addresses, if the phi is directly used as load/store address. Don't
3039+
// traverse further for Blends.
3040+
if (Blend)
3041+
continue;
3042+
3043+
append_range(WorkList, Cur->users());
30233044
}
30243045
return false;
30253046
}
@@ -3140,15 +3161,14 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
31403161

31413162
bool IsLoad = UI->getOpcode() == Instruction::Load;
31423163
const VPValue *PtrOp = getOperand(!IsLoad);
3143-
// TODO: Handle cases where we need to pass a SCEV to
3144-
// getAddressComputationCost.
3145-
if (shouldUseAddressAccessSCEV(PtrOp))
3164+
const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
3165+
if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
31463166
break;
31473167

31483168
Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
31493169
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
31503170
const Align Alignment = getLoadStoreAlignment(UI);
3151-
unsigned AS = getLoadStoreAddressSpace(UI);
3171+
unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
31523172
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
31533173
InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
31543174
UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
@@ -3160,7 +3180,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
31603180
InstructionCost ScalarCost =
31613181
ScalarMemOpCost +
31623182
Ctx.TTI.getAddressComputationCost(
3163-
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, nullptr);
3183+
PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, PtrSCEV);
31643184
if (isSingleScalar())
31653185
return ScalarCost;
31663186

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,9 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
137137
IndexExprs.push_back(IndexExpr);
138138
}
139139

140-
auto *GEP = cast<GEPOperator>(R->getUnderlyingInstr());
141-
return SE.getGEPExpr(const_cast<GEPOperator *>(GEP), IndexExprs);
140+
Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
141+
->getSourceElementType();
142+
return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
142143
})
143144
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
144145
}

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -464,13 +464,14 @@ define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %pt
464464
; I64-NEXT: [[ENTRY:.*]]:
465465
; I64-NEXT: br label %[[OUTER_LOOP:.*]]
466466
; I64: [[OUTER_LOOP_LOOPEXIT:.*]]:
467+
; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], [[INNER_LOOP:%.*]] ], [ [[TMP29:%.*]], %[[MIDDLE_BLOCK:.*]] ]
467468
; I64-NEXT: br label %[[OUTER_LOOP]]
468469
; I64: [[OUTER_LOOP]]:
469-
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ]
470+
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ]
470471
; I64-NEXT: [[COND:%.*]] = call i1 @cond()
471-
; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
472+
; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], [[EXIT:label %.*]]
472473
; I64: [[INNER_LOOP_PREHEADER]]:
473-
; I64-NEXT: br label %[[VECTOR_PH:.*]]
474+
; I64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
474475
; I64: [[VECTOR_PH]]:
475476
; I64-NEXT: br label %[[VECTOR_BODY:.*]]
476477
; I64: [[VECTOR_BODY]]:
@@ -507,12 +508,12 @@ define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %pt
507508
; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1
508509
; I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]]
509510
; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]]
510-
; I64-NEXT: br label %[[MIDDLE_BLOCK:.*]]
511+
; I64-NEXT: br label %[[MIDDLE_BLOCK]]
511512
; I64: [[MIDDLE_BLOCK]]:
512513
; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1
513-
; I64-NEXT: br label %[[OUTER_LOOP_LOOPEXIT]]
514-
; I64: [[EXIT]]:
515-
; I64-NEXT: ret double [[ACCUM]]
514+
; I64-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x double> [[TMP18]], i32 1
515+
; I64-NEXT: br i1 true, label %[[OUTER_LOOP_LOOPEXIT]], label %[[SCALAR_PH]]
516+
; I64: [[SCALAR_PH]]:
516517
;
517518
; I32-LABEL: define double @test_load_used_by_other_load_scev(
518519
; I32-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) {

0 commit comments

Comments
 (0)