
Commit 38569d9

Authored and committed by Vasileios Porpodas

[Spill2Reg] Add live register tracking
This patch implements tracking of live registers, which is used to look for
free vector registers. It works by walking up the CFG from the reloads all the
way to the spills, accumulating the register units being used. This
implementation caches the live register units used by each MBB for faster
compilation time.

Note: Live register tracking relies on MBB live-ins/outs being maintained
correctly, which is implemented in a follow-up patch. So this patch still does
not generate correct code except in some simple cases.

Original review: https://reviews.llvm.org/D118303
1 parent 92e26f0 commit 38569d9
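
Not part of the patch, but as a rough sketch of the idea described above: LLVM's LiveRegUnits is seeded with a block's live-outs and accumulated while walking the block bottom-up, and can then be queried for a free register in a register class. The helper name findFreeRegInBlock below is hypothetical; the patch itself performs the per-block accumulation once in collectSpillsAndReloads() and caches it in the LRUs map so that calculateLiveRegs() can reuse it for intermediate blocks.

// Illustrative sketch only (not from this commit): accumulate LiveRegUnits
// bottom-up over one MachineBasicBlock, then look for a free register in a
// register class. findFreeRegInBlock is a hypothetical helper name.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include <optional>

static std::optional<llvm::MCRegister>
findFreeRegInBlock(const llvm::MachineBasicBlock &MBB,
                   const llvm::TargetRegisterClass *RC,
                   const llvm::TargetRegisterInfo &TRI) {
  llvm::LiveRegUnits LRU(TRI);
  LRU.addLiveOuts(MBB);           // Seed with the block's live-out registers.
  for (const llvm::MachineInstr &MI : llvm::reverse(MBB))
    LRU.accumulate(MI);           // Record every def/use while walking upwards.
  for (llvm::MCPhysReg PhysReg : *RC)
    if (LRU.available(PhysReg))   // No register unit of PhysReg is in use.
      return llvm::MCRegister(PhysReg);
  return std::nullopt;
}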

12 files changed: +1170 -33 lines changed

llvm/lib/CodeGen/Spill2Reg.cpp

Lines changed: 130 additions & 5 deletions
@@ -99,6 +99,9 @@ class Spill2Reg : public MachineFunctionPass {
   /// Helper for generateCode(). It replaces stack spills or reloads with movs
   /// to \p LI.reg().
   void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  /// Updates the live-ins of MBBs after we emit the new spill2reg instructions
+  /// and the vector registers become live from register spills to reloads.
+  void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg);
   /// Updates \p LRU with the liveness of physical registers around the spills
   /// and reloads in \p Entry.
   void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
@@ -111,6 +114,9 @@ class Spill2Reg : public MachineFunctionPass {
 
   /// Map from a stack slot to the corresponding spills and reloads.
   DenseMap<int, StackSlotDataEntry> StackSlotData;
+  /// The registers used by each block (from LiveRegUnits). This is needed for
+  /// finding free physical registers in generateCode().
+  DenseMap<const MachineBasicBlock *, LiveRegUnits> LRUs;
 
   MachineFunction *MF = nullptr;
   MachineRegisterInfo *MRI = nullptr;
@@ -169,7 +175,16 @@ void Spill2Reg::collectSpillsAndReloads() {
   // If any spill/reload for a stack slot is found not to be eligible for
   // spill-to-reg, then that stack slot is disabled.
   for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
+    // Initialize AccumMBBLRU for keeping track of physical registers used
+    // across the whole MBB.
+    LiveRegUnits AccumMBBLRU(*TRI);
+    AccumMBBLRU.addLiveOuts(MBB);
+
+    // Collect spills/reloads
+    for (MachineInstr &MI : llvm::reverse(MBB)) {
+      // Update the LRU state as we move upwards.
+      AccumMBBLRU.accumulate(MI);
+
       int StackSlot;
       if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) {
         MachineInstr *Spill = &MI;
@@ -203,6 +218,8 @@ void Spill2Reg::collectSpillsAndReloads() {
         }
       }
     }
+
+    LRUs.insert(std::make_pair(&MBB, AccumMBBLRU));
   }
 }
 
@@ -228,6 +245,26 @@ Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
   return std::nullopt;
 }
 
+/// Perform a bottom-up depth-first traversal from \p MBB towards its
+/// predecessor blocks. \p Visited marks the visited blocks. \p Fn is the
+/// callback function called in pre-order. If \p Fn returns true we stop the
+/// traversal.
+// TODO: Use df_iterator
+static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited,
+                std::function<bool(MachineBasicBlock *)> Fn) {
+  // Skip visited to avoid infinite loops.
+  if (Visited.count(MBB))
+    return;
+  Visited.insert(MBB);
+
+  // Preorder.
+  if (Fn(MBB))
+    return;
+
+  // Depth-first across predecessors.
+  for (MachineBasicBlock *PredMBB : MBB->predecessors())
+    DFS(PredMBB, Visited, Fn);
+}
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
@@ -236,10 +273,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(SpillData.MO->isReg() && "Expected register MO");
     Register OldReg = SpillData.MO->getReg();
 
-    MachineInstr *SpillToVector = TII->spill2RegInsertToVectorReg(
+    TII->spill2RegInsertToVectorReg(
         VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
         /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackSpill->getParent()].addReg(VectorReg);
+
     // Spill to stack is no longer needed.
     StackSpill->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -250,10 +290,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(ReloadData.MO->isReg() && "Expected Reg MO");
     Register OldReg = ReloadData.MO->getReg();
 
-    MachineInstr *ReloadFromReg = TII->spill2RegExtractFromVectorReg(
+    TII->spill2RegExtractFromVectorReg(
         OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
         /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackReload->getParent()].addReg(VectorReg);
+
     // Reload from stack is no longer needed.
     StackReload->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -262,7 +305,86 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
 void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
                                   LiveRegUnits &LRU) {
-  // TODO: Unimplemented
+  // Collect the parent MBBs of Spills for fast lookup.
+  DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size());
+  DenseSet<MachineInstr *> Spills(Entry.Spills.size());
+  for (const auto &Data : Entry.Spills) {
+    SpillMBBs.insert(Data.MI->getParent());
+    Spills.insert(Data.MI);
+  }
+
+  /// Walks up the instructions in \p Reload's block, stopping at a spill if
+  /// found. \Returns true if a spill was found, false otherwise.
+  auto AccumulateLRUUntilSpillFn = [&Spills, &SpillMBBs](MachineInstr *Reload,
+                                                         LiveRegUnits &LRU) {
+    MachineBasicBlock *MBB = Reload->getParent();
+    bool IsSpillBlock = SpillMBBs.count(MBB);
+    // Add all MBB's live-outs.
+    LRU.addLiveOuts(*MBB);
+    // Walk up the BB, starting from Reload, looking for any spill.
+    for (MachineInstr *CurrMI = Reload; CurrMI != nullptr;
+         CurrMI = CurrMI->getPrevNode()) {
+      LRU.accumulate(*CurrMI);
+      // If a spill is found then return true to end the recursion.
+      if (IsSpillBlock && Spills.count(CurrMI))
+        return true;
+    }
+    return false;
+  };
+
+  // Helper for the traversal. It accumulates all register units used in
+  // \p MBB. It returns true once a spill is found.
+  auto AccumulateLRUFn = [&SpillMBBs, &LRU, AccumulateLRUUntilSpillFn,
+                          this](MachineBasicBlock *MBB) {
+    if (SpillMBBs.count(MBB)) {
+      // If this is a spill block, then walk bottom-up until the spill.
+      assert(!MBB->empty() && "How can it be a spill block and empty?");
+      bool FoundSpill = AccumulateLRUUntilSpillFn(&*MBB->rbegin(), LRU);
+      assert(FoundSpill && "Spill block but we couldn't find spill!");
+      // We return true to stop the recursion.
+      return true;
+    }
+    // Else this is an intermediate block between the spills and reloads with
+    // no spill in it, so use the pre-computed LRU to avoid walking it again.
+    // This improves compilation time.
+    LRU.addUnits(LRUs[MBB].getBitVector());
+    // We return false to continue the recursion.
+    return false;
+  };
+
+  /// \Returns the LiveRegUnits at `Reload` by stepping back the BB.
+  auto GetReloadLRU = [this](MachineInstr *Reload) {
+    LiveRegUnits ReloadLRU(*TRI);
+    MachineBasicBlock *MBB = Reload->getParent();
+    ReloadLRU.addLiveOuts(*MBB);
+    // Start at the bottom of the BB and walk up until we find `Reload`.
+    for (MachineInstr &MI : llvm::reverse(*MBB)) {
+      if (&MI == Reload)
+        break;
+      // TODO: Check if this should be accumulate() instead of stepBackward().
+      ReloadLRU.stepBackward(MI);
+    }
+    return ReloadLRU;
+  };
+
+  // Start from each Reload and walk up the CFG with a depth-first traversal,
+  // looking for spills. Upon finding a spill we don't go beyond that point. In
+  // the meantime we accumulate the registers used. This is then used to find
+  // free physical registers.
+  DenseSet<MachineBasicBlock *> Visited;
+  for (const auto &ReloadData : Entry.Reloads) {
+    MachineInstr *Reload = ReloadData.MI;
+    // Add the Reload's LRU to the total LRU for the whole Spill-Reload range.
+    LiveRegUnits ReloadLRU = GetReloadLRU(Reload);
+    bool FoundSpill = AccumulateLRUUntilSpillFn(Reload, ReloadLRU);
+    LRU.addUnits(ReloadLRU.getBitVector());
+
+    // Traverse the CFG bottom-up accumulating LRUs until we reach the Spills.
+    if (!FoundSpill) {
+      for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors())
+        DFS(PredMBB, Visited, AccumulateLRUFn);
+    }
+  }
 }
 
 void Spill2Reg::generateCode() {
@@ -293,7 +415,10 @@ void Spill2Reg::generateCode() {
   }
 }
 
-void Spill2Reg::cleanup() { StackSlotData.clear(); }
+void Spill2Reg::cleanup() {
+  StackSlotData.clear();
+  LRUs.clear();
+}
 
 bool Spill2Reg::run() {
   // Walk over each instruction in the code keeping track of the processor's
Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s

; End-to-end check that Spill2Reg works with 16-bit registers.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@D0 = dso_local local_unnamed_addr global i16 0, align 4
@D1 = dso_local local_unnamed_addr global i16 0, align 4
@D2 = dso_local local_unnamed_addr global i16 0, align 4
@D3 = dso_local local_unnamed_addr global i16 0, align 4
@D4 = dso_local local_unnamed_addr global i16 0, align 4
@D5 = dso_local local_unnamed_addr global i16 0, align 4
@D6 = dso_local local_unnamed_addr global i16 0, align 4
@D7 = dso_local local_unnamed_addr global i16 0, align 4
@D8 = dso_local local_unnamed_addr global i16 0, align 4
@D9 = dso_local local_unnamed_addr global i16 0, align 4
@D10 = dso_local local_unnamed_addr global i16 0, align 4
@D11 = dso_local local_unnamed_addr global i16 0, align 4
@D12 = dso_local local_unnamed_addr global i16 0, align 4
@D13 = dso_local local_unnamed_addr global i16 0, align 4
@D14 = dso_local local_unnamed_addr global i16 0, align 4
@D15 = dso_local local_unnamed_addr global i16 0, align 4
@D16 = dso_local local_unnamed_addr global i16 0, align 4
@D17 = dso_local local_unnamed_addr global i16 0, align 4
@D18 = dso_local local_unnamed_addr global i16 0, align 4
@U0 = dso_local local_unnamed_addr global i16 0, align 4
@U1 = dso_local local_unnamed_addr global i16 0, align 4
@U2 = dso_local local_unnamed_addr global i16 0, align 4
@U3 = dso_local local_unnamed_addr global i16 0, align 4
@U4 = dso_local local_unnamed_addr global i16 0, align 4
@U5 = dso_local local_unnamed_addr global i16 0, align 4
@U6 = dso_local local_unnamed_addr global i16 0, align 4
@U7 = dso_local local_unnamed_addr global i16 0, align 4
@U8 = dso_local local_unnamed_addr global i16 0, align 4
@U9 = dso_local local_unnamed_addr global i16 0, align 4
@U10 = dso_local local_unnamed_addr global i16 0, align 4
@U11 = dso_local local_unnamed_addr global i16 0, align 4
@U12 = dso_local local_unnamed_addr global i16 0, align 4
@U13 = dso_local local_unnamed_addr global i16 0, align 4
@U14 = dso_local local_unnamed_addr global i16 0, align 4
@U15 = dso_local local_unnamed_addr global i16 0, align 4
@U16 = dso_local local_unnamed_addr global i16 0, align 4
@U17 = dso_local local_unnamed_addr global i16 0, align 4
@U18 = dso_local local_unnamed_addr global i16 0, align 4
; Function Attrs: mustprogress noinline nounwind uwtable
define dso_local void @_Z5spillv() local_unnamed_addr #0 {
; CHECK-LABEL: _Z5spillv:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movzwl D0(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D1(%rip), %ecx
; CHECK-NEXT:    movzwl D2(%rip), %edx
; CHECK-NEXT:    movzwl D3(%rip), %esi
; CHECK-NEXT:    movzwl D4(%rip), %edi
; CHECK-NEXT:    movzwl D5(%rip), %r8d
; CHECK-NEXT:    movzwl D6(%rip), %r9d
; CHECK-NEXT:    movzwl D7(%rip), %r10d
; CHECK-NEXT:    movzwl D8(%rip), %r11d
; CHECK-NEXT:    movzwl D9(%rip), %ebx
; CHECK-NEXT:    movzwl D10(%rip), %ebp
; CHECK-NEXT:    movzwl D11(%rip), %r14d
; CHECK-NEXT:    movzwl D12(%rip), %r15d
; CHECK-NEXT:    movzwl D13(%rip), %r12d
; CHECK-NEXT:    movzwl D14(%rip), %r13d
; CHECK-NEXT:    movzwl D15(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D16(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D17(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D18(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U0(%rip)
; CHECK-NEXT:    movw %cx, U1(%rip)
; CHECK-NEXT:    movw %dx, U2(%rip)
; CHECK-NEXT:    movw %si, U3(%rip)
; CHECK-NEXT:    movw %di, U4(%rip)
; CHECK-NEXT:    movw %r8w, U5(%rip)
; CHECK-NEXT:    movw %r9w, U6(%rip)
; CHECK-NEXT:    movw %r10w, U7(%rip)
; CHECK-NEXT:    movw %r11w, U8(%rip)
; CHECK-NEXT:    movw %bx, U9(%rip)
; CHECK-NEXT:    movw %bp, U10(%rip)
; CHECK-NEXT:    movw %r14w, U11(%rip)
; CHECK-NEXT:    movw %r15w, U12(%rip)
; CHECK-NEXT:    movw %r12w, U13(%rip)
; CHECK-NEXT:    movw %r13w, U14(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U15(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U16(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U17(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U18(%rip)
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
entry:
134+
%0 = load i16, i16* @D0
135+
%1 = load i16, i16* @D1
136+
%2 = load i16, i16* @D2
137+
%3 = load i16, i16* @D3
138+
%4 = load i16, i16* @D4
139+
%5 = load i16, i16* @D5
140+
%6 = load i16, i16* @D6
141+
%7 = load i16, i16* @D7
142+
%8 = load i16, i16* @D8
143+
%9 = load i16, i16* @D9
144+
%10 = load i16, i16* @D10
145+
%11 = load i16, i16* @D11
146+
%12 = load i16, i16* @D12
147+
%13 = load i16, i16* @D13
148+
%14 = load i16, i16* @D14
149+
%15 = load i16, i16* @D15
150+
%16 = load i16, i16* @D16
151+
%17 = load i16, i16* @D17
152+
%18 = load i16, i16* @D18
153+
call void asm sideeffect "", "~{memory}"() #1
154+
store i16 %0, i16* @U0
155+
store i16 %1, i16* @U1
156+
store i16 %2, i16* @U2
157+
store i16 %3, i16* @U3
158+
store i16 %4, i16* @U4
159+
store i16 %5, i16* @U5
160+
store i16 %6, i16* @U6
161+
store i16 %7, i16* @U7
162+
store i16 %8, i16* @U8
163+
store i16 %9, i16* @U9
164+
store i16 %10, i16* @U10
165+
store i16 %11, i16* @U11
166+
store i16 %12, i16* @U12
167+
store i16 %13, i16* @U13
168+
store i16 %14, i16* @U14
169+
store i16 %15, i16* @U15
170+
store i16 %16, i16* @U16
171+
store i16 %17, i16* @U17
172+
store i16 %18, i16* @U18
173+
ret void
174+
}
175+
176+
attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
177+
attributes #1 = { nounwind }
