Skip to content

Commit 7b146a6

Browse files
committed
[LV] Compute register usage for interleaving on VPlan.
Add a version of calculateRegisterUsage that estimates register usage for a VPlan. This mostly just ports the existing code, with some updates to figure out which recipes will generate vectors vs. scalars. There are a number of changes in the computed register usages, but they should be more accurate w.r.t. the generated vector code. There are the following changes: * Scalar usage increases in most cases by 1, as we always create a scalar canonical IV, which is alive across the loop and is not considered by the legacy implementation. * Output is ordered by insertion; scalar registers are now added first due to the canonical IV phi. * Using the VPlan, we now also know more precisely whether an induction will be vectorized or scalarized.
1 parent cd0a2a3 commit 7b146a6

File tree

13 files changed

+338
-143
lines changed

13 files changed

+338
-143
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 229 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,8 @@ class LoopVectorizationCostModel {
10181018
/// If interleave count has been specified by metadata it will be returned.
10191019
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
10201020
/// are the selected vectorization factor and the cost of the selected VF.
1021-
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1021+
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
1022+
InstructionCost LoopCost);
10221023

10231024
/// Memory access instruction may be vectorized in more than one way.
10241025
/// Form of instruction after vectorization depends on cost.
@@ -4885,8 +4886,232 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48854886
}
48864887
}
48874888

4889+
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs.
4890+
/// Returns the register usage for each VF in \p VFs.
4891+
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4892+
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4893+
const TargetTransformInfo &TTI) {
4894+
// This function calculates the register usage by measuring the highest number
4895+
// of values that are alive at a single location. Obviously, this is a very
4896+
// rough estimation. We scan the loop in topological order and
4897+
// assign a number to each recipe. We use RPO to ensure that defs are
4898+
// met before their users. We assume that each recipe that has in-loop
4899+
// users starts an interval. We record every time that an in-loop value is
4900+
// used, so we have a list of the first and last occurrences of each
4901+
// recipe. Next, we transpose this data structure into a multi map that
4902+
// holds the list of intervals that *end* at a specific location. This multi
4903+
// map allows us to perform a linear search. We scan the instructions linearly
4904+
// and record each time that a new interval starts, by placing it in a set.
4905+
// If we find this value in the multi-map then we remove it from the set.
4906+
// The max register usage is the maximum size of the set.
4907+
// We also search for instructions that are defined outside the loop, but are
4908+
// used inside the loop. We need this number separately from the max-interval
4909+
// usage number because when we unroll, loop-invariant values do not take
4910+
// more registers.
4911+
LoopVectorizationCostModel::RegisterUsage RU;
4912+
4913+
// Each 'key' in the map opens a new interval. The values
4914+
// of the map are the index of the 'last seen' usage of the
4915+
// recipe that is the key.
4916+
using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
4917+
4918+
// Maps each index to its recipe.
4919+
SmallVector<VPRecipeBase *, 64> IdxToRecipe;
4920+
// Marks the end of each interval.
4921+
IntervalMap EndPoint;
4922+
// Saves the set of recipes whose values are used in the loop.
4923+
SmallPtrSet<VPRecipeBase *, 8> Ends;
4924+
// Saves the list of values that are used in the loop but are defined outside
4925+
// the loop (not including non-recipe values such as arguments and
4926+
// constants).
4927+
SmallSetVector<VPValue *, 8> LoopInvariants;
4928+
LoopInvariants.insert(&Plan.getVectorTripCount());
4929+
4930+
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4931+
Plan.getVectorLoopRegion());
4932+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4933+
if (!VPBB->getParent())
4934+
break;
4935+
for (VPRecipeBase &R : *VPBB) {
4936+
IdxToRecipe.push_back(&R);
4937+
4938+
// Save the end location of each USE.
4939+
for (VPValue *U : R.operands()) {
4940+
auto *DefR = U->getDefiningRecipe();
4941+
4942+
// Ignore non-recipe values such as arguments, constants, etc.
4943+
// FIXME: Might need some motivation why these values are ignored. If
4944+
// for example an argument is used inside the loop it will increase the
4945+
// register pressure (so shouldn't we add it to LoopInvariants).
4946+
if (!DefR && (!U->getLiveInIRValue() ||
4947+
!isa<Instruction>(U->getLiveInIRValue())))
4948+
continue;
4949+
4950+
// If this value has no defining recipe (a live-in), record it and continue.
4951+
if (!DefR) {
4952+
LoopInvariants.insert(U);
4953+
continue;
4954+
}
4955+
4956+
// Overwrite previous end points.
4957+
EndPoint[DefR] = IdxToRecipe.size();
4958+
Ends.insert(DefR);
4959+
}
4960+
}
4961+
if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
4962+
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4963+
// exiting block, where their increment will get materialized eventually.
4964+
for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
4965+
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4966+
EndPoint[&R] = IdxToRecipe.size();
4967+
Ends.insert(&R);
4968+
}
4969+
}
4970+
}
4971+
}
4972+
4973+
// Saves the list of intervals that end with the index in 'key'.
4974+
using RecipeList = SmallVector<VPRecipeBase *, 2>;
4975+
SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
4976+
4977+
// Transpose the EndPoints to a list of values that end at each index.
4978+
for (auto &Interval : EndPoint)
4979+
TransposeEnds[Interval.second].push_back(Interval.first);
4980+
4981+
SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
4982+
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
4983+
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
4984+
4985+
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4986+
4987+
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4988+
4989+
const auto &TTICapture = TTI;
4990+
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4991+
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
4992+
(VF.isScalable() &&
4993+
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
4994+
return 0;
4995+
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
4996+
};
4997+
4998+
for (unsigned int Idx = 0, Sz = IdxToRecipe.size(); Idx < Sz; ++Idx) {
4999+
VPRecipeBase *R = IdxToRecipe[Idx];
5000+
5001+
// Remove all of the recipes that end at this location.
5002+
RecipeList &List = TransposeEnds[Idx];
5003+
for (VPRecipeBase *ToRemove : List)
5004+
OpenIntervals.erase(ToRemove);
5005+
5006+
// Ignore recipes that are never used within the loop.
5007+
if (!Ends.count(R) && !R->mayHaveSideEffects())
5008+
continue;
5009+
5010+
// For each VF find the maximum usage of registers.
5011+
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5012+
// Count the number of registers used, per register class, given all open
5013+
// intervals.
5014+
// Note that elements in this SmallMapVector will be default constructed
5015+
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5016+
// there is no previous entry for ClassID.
5017+
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5018+
5019+
if (VFs[J].isScalar()) {
5020+
for (auto *Inst : OpenIntervals) {
5021+
for (VPValue *DefV : Inst->definedValues()) {
5022+
unsigned ClassID = TTI.getRegisterClassForType(
5023+
false, TypeInfo.inferScalarType(DefV));
5024+
// FIXME: The target might use more than one register for the type
5025+
// even in the scalar case.
5026+
RegUsage[ClassID] += 1;
5027+
}
5028+
}
5029+
} else {
5030+
for (auto *R : OpenIntervals) {
5031+
if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
5032+
continue;
5033+
if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5034+
VPScalarIVStepsRecipe>(R) ||
5035+
(isa<VPInstruction>(R) &&
5036+
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
5037+
return cast<VPRecipeBase>(U)->usesScalars(
5038+
R->getVPSingleValue());
5039+
}))) {
5040+
unsigned ClassID = TTI.getRegisterClassForType(
5041+
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
5042+
// FIXME: The target might use more than one register for the type
5043+
// even in the scalar case.
5044+
RegUsage[ClassID] += 1;
5045+
} else {
5046+
for (VPValue *DefV : R->definedValues()) {
5047+
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
5048+
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5049+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
5050+
}
5051+
}
5052+
}
5053+
}
5054+
5055+
for (const auto &Pair : RegUsage) {
5056+
auto &Entry = MaxUsages[J][Pair.first];
5057+
Entry = std::max(Entry, Pair.second);
5058+
}
5059+
}
5060+
5061+
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5062+
<< OpenIntervals.size() << '\n');
5063+
5064+
// Add the current recipe to the list of open intervals.
5065+
OpenIntervals.insert(R);
5066+
}
5067+
5068+
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5069+
// Note that elements in this SmallMapVector will be default constructed
5070+
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5071+
// there is no previous entry for ClassID.
5072+
SmallMapVector<unsigned, unsigned, 4> Invariant;
5073+
5074+
for (auto *In : LoopInvariants) {
5075+
// FIXME: The target might use more than one register for the type
5076+
// even in the scalar case.
5077+
bool IsScalar = all_of(In->users(), [&](VPUser *U) {
5078+
return cast<VPRecipeBase>(U)->usesScalars(In);
5079+
});
5080+
5081+
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5082+
unsigned ClassID = TTI.getRegisterClassForType(
5083+
VF.isVector(), TypeInfo.inferScalarType(In));
5084+
Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
5085+
}
5086+
5087+
LLVM_DEBUG({
5088+
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5089+
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5090+
<< " item\n";
5091+
for (const auto &pair : MaxUsages[Idx]) {
5092+
dbgs() << "LV(REG): RegisterClass: "
5093+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5094+
<< " registers\n";
5095+
}
5096+
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5097+
<< " item\n";
5098+
for (const auto &pair : Invariant) {
5099+
dbgs() << "LV(REG): RegisterClass: "
5100+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5101+
<< " registers\n";
5102+
}
5103+
});
5104+
5105+
RU.LoopInvariantRegs = Invariant;
5106+
RU.MaxLocalUsers = MaxUsages[Idx];
5107+
RUs[Idx] = RU;
5108+
}
5109+
5110+
return RUs;
5111+
}
5112+
48885113
unsigned
4889-
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5114+
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48905115
InstructionCost LoopCost) {
48915116
// -- The interleave heuristics --
48925117
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4936,7 +5161,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49365161
return 1;
49375162
}
49385163

4939-
RegisterUsage R = calculateRegisterUsage({VF})[0];
5164+
RegisterUsage R = ::calculateRegisterUsage(Plan, {VF}, TTI)[0];
49405165
// We divide by these constants so assume that we have at least one
49415166
// instruction that uses at least one register.
49425167
for (auto &Pair : R.MaxLocalUsers) {
@@ -10694,7 +10919,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1069410919
AddBranchWeights, CM.CostKind);
1069510920
if (LVP.hasPlanWithVF(VF.Width)) {
1069610921
// Select the interleave count.
10697-
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10922+
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
1069810923

1069910924
unsigned SelectedIC = std::max(IC, UserIC);
1070010925
// Optimistically generate runtime checks if they are needed. Drop them if

llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ target triple = "aarch64"
88
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
99
; CHECK: LV(REG): VF = 32
1010
; CHECK-NEXT: LV(REG): Found max usage: 2 item
11+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
1112
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
12-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
1313

1414
define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
1515
entry:
@@ -31,8 +31,8 @@ loop:
3131
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
3232
; CHECK: LV(REG): VF = 64
3333
; CHECK-NEXT: LV(REG): Found max usage: 2 item
34+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
3435
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
35-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
3636

3737
define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
3838
entry:

llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,10 @@ define void @get_invariant_reg_usage(ptr %z) {
1616
; CHECK-LABEL: LV: Checking a loop in 'get_invariant_reg_usage'
1717
; CHECK: LV(REG): VF = vscale x 16
1818
; CHECK-NEXT: LV(REG): Found max usage: 2 item
19-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
20-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers
21-
; CHECK-NEXT: LV(REG): Found invariant usage: 2 item
2219
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
23-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
20+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers
21+
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
22+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
2423

2524
L.entry:
2625
%0 = load i128, ptr %z, align 16

llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,18 @@
99
define void @bar(ptr %A, i32 signext %n) {
1010
; CHECK-LABEL: bar
1111
; CHECK-SCALAR: LV(REG): Found max usage: 2 item
12-
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
12+
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 3 registers
1313
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers
1414
; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
1515
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
1616
; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
1717
; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class
1818
; CHECK-VECTOR: LV(REG): Found max usage: 2 item
19-
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 3 registers
20-
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
19+
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
20+
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers
2121
; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item
2222
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
23+
; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
2324
; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class
2425

2526
entry:

0 commit comments

Comments
 (0)