
Commit 7a4e635

Author: Vasileios Porpodas
[Spill2Reg] Add live register tracking
This patch implements tracking of live registers, which is used to look for free vector registers. It works by walking up the CFG from the reloads all the way to the spills, accumulating the register units being used. The implementation caches the live register units used by each MBB for faster compilation time.

Note: live register tracking relies on MBB live-ins/outs being maintained correctly, which is implemented in a follow-up patch, so this patch still generates correct code only in some simple cases.

Original review: https://reviews.llvm.org/D118303
1 parent 768eca5 commit 7a4e635

12 files changed: +1168 −31 lines
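To make the walk described in the commit message concrete, here is a minimal, self-contained C++ model of the idea. It is not the patch's code: Block, Inst, RegSet, BlockSummary and the function names are made-up stand-ins for MachineBasicBlock, MachineInstr, LiveRegUnits and the pass's LRUs map. Starting at a reload, the walk accumulates the register units of every instruction it passes, stops a branch as soon as it reaches a spill, and folds fully-traversed intermediate blocks in via a cached whole-block summary.

#include <bitset>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using RegSet = std::bitset<64>; // stand-in for LiveRegUnits' bit vector

struct Inst {
  RegSet Used;          // register units defined or used by this instruction
  bool IsSpill = false; // is this one of the range's spills?
};

struct Block {
  std::vector<Inst> Insts;    // instructions in program order
  std::vector<Block *> Preds; // CFG predecessors
  RegSet LiveOuts;            // stand-in for addLiveOuts()
};

// Whole-block summaries, filled once per block up front (modeling the
// patch's LRUs cache); BlockSummary[&B] == summarize(B) is assumed.
std::unordered_map<const Block *, RegSet> BlockSummary;

RegSet summarize(const Block &B) {
  RegSet R = B.LiveOuts;
  for (auto It = B.Insts.rbegin(); It != B.Insts.rend(); ++It)
    R |= It->Used; // bottom-up accumulation over the whole block
  return R;
}

// Walk up from instruction index From in B, accumulating used units into
// Live; return true if the walk hit a spill (and therefore stopped).
bool accumulateUntilSpill(const Block &B, int From, RegSet &Live) {
  Live |= B.LiveOuts;
  for (int I = From; I >= 0; --I) {
    Live |= B.Insts[I].Used;
    if (B.Insts[I].IsSpill)
      return true;
  }
  return false;
}

// Bottom-up DFS over predecessors: spill blocks end their branch of the
// walk; intermediate blocks are folded in via the cached summary.
void walkUp(Block *B, std::unordered_set<Block *> &Visited, RegSet &Live,
            const std::unordered_set<Block *> &SpillBlocks) {
  if (!Visited.insert(B).second)
    return; // already visited: avoid cycles
  if (SpillBlocks.count(B)) {
    accumulateUntilSpill(*B, (int)B->Insts.size() - 1, Live);
    return; // do not walk past the spill
  }
  Live |= BlockSummary[B];
  for (Block *P : B->Preds)
    walkUp(P, Visited, Live, SpillBlocks);
}

A unit still clear in Live at the end of the walk is unused everywhere between the spills and the reloads, which is what lets the real pass pick a free vector register for the spilled value.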

llvm/lib/CodeGen/Spill2Reg.cpp

Lines changed: 128 additions & 3 deletions
@@ -98,6 +98,9 @@ class Spill2Reg : public MachineFunctionPass {
   /// Helper for generateCode(). It replaces stack spills or reloads with movs
   /// to \p LI.reg().
   void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  /// Updates the live-ins of MBBs after we emit the new spill2reg instructions
+  /// and the vector registers become live from register spills to reloads.
+  void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg);
   /// Updates \p LRU with the liveness of physical registers around the spills
   /// and reloads in \p Entry.
   void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
@@ -110,6 +113,9 @@ class Spill2Reg : public MachineFunctionPass {
 
   /// Map from a stack slot to the corresponding spills and reloads.
   DenseMap<int, StackSlotDataEntry> StackSlotData;
+  /// The registers used by each block (from LiveRegUnits). This is needed for
+  /// finding free physical registers in generateCode().
+  DenseMap<const MachineBasicBlock *, LiveRegUnits> LRUs;
 
   MachineFunction *MF = nullptr;
   MachineRegisterInfo *MRI = nullptr;
@@ -168,7 +174,16 @@ void Spill2Reg::collectSpillsAndReloads() {
   // If any spill/reload for a stack slot is found not to be eligible for
   // spill-to-reg, then that stack slot is disabled.
   for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
+    // Initialize AccumMBBLRU for keeping track of physical registers used
+    // across the whole MBB.
+    LiveRegUnits AccumMBBLRU(*TRI);
+    AccumMBBLRU.addLiveOuts(MBB);
+
+    // Collect spills/reloads
+    for (MachineInstr &MI : llvm::reverse(MBB)) {
+      // Update the LRU state as we move upwards.
+      AccumMBBLRU.accumulate(MI);
+
       int StackSlot;
       if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) {
         MachineInstr *Spill = &MI;
@@ -202,6 +217,8 @@ void Spill2Reg::collectSpillsAndReloads() {
         }
       }
     }
+
+    LRUs.insert(std::make_pair(&MBB, AccumMBBLRU));
   }
 }
 
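The LRUs map populated at the bottom of this loop is what makes the later CFG walk cheap: intermediate blocks are folded in with one bitwise OR instead of an instruction-by-instruction re-walk. A sketch of the dual-purpose reverse pass, reusing the model types from the first sketch above (scanBlock is a made-up name):

// Model of the dual-purpose reverse walk in collectSpillsAndReloads():
// a single bottom-up pass both scans for spills/reloads and builds the
// whole-block summary that calculateLiveRegs() later reuses.
void scanBlock(const Block &B) {
  RegSet Accum = B.LiveOuts; // AccumMBBLRU.addLiveOuts(MBB)
  for (auto It = B.Insts.rbegin(); It != B.Insts.rend(); ++It) {
    Accum |= It->Used; // AccumMBBLRU.accumulate(MI)
    // ... the real pass checks for spill/reload instructions here ...
  }
  BlockSummary[&B] = Accum; // LRUs.insert({&MBB, AccumMBBLRU})
}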
@@ -227,6 +244,26 @@ Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
   return std::nullopt;
 }
 
+/// Perform a bottom-up depth-first traversal from \p MBB towards its
+/// predecessor blocks. \p Visited marks the visited blocks. \p Fn is the
+/// callback function called in pre-order. If \p Fn returns true we stop the
+/// traversal.
+// TODO: Use df_iterator
+static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited,
+                std::function<bool(MachineBasicBlock *)> Fn) {
+  // Skip visited to avoid infinite loops.
+  if (Visited.count(MBB))
+    return;
+  Visited.insert(MBB);
+
+  // Preorder.
+  if (Fn(MBB))
+    return;
+
+  // Depth-first across predecessors.
+  for (MachineBasicBlock *PredMBB : MBB->predecessors())
+    DFS(PredMBB, Visited, Fn);
+}
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
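The DFS helper above recurses once per predecessor block, which can get deep on large CFGs; the TODO about df_iterator hints at the same concern. An equivalent iterative form with an explicit worklist, again written against the made-up model types from the first sketch (dfsUp is a made-up name):

#include <functional>

// Iterative equivalent of the recursive pre-order DFS above: Fn returning
// true prunes that branch (its predecessors are not pushed).
void dfsUp(Block *Start, std::unordered_set<Block *> &Visited,
           const std::function<bool(Block *)> &Fn) {
  std::vector<Block *> Worklist{Start};
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(B).second)
      continue; // already visited: avoid cycles
    if (Fn(B))
      continue; // pre-order callback asked to stop at this block
    for (Block *P : B->Preds)
      Worklist.push_back(P);
  }
}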
@@ -239,6 +276,9 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
         VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
         /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackSpill->getParent()].addReg(VectorReg);
+
     // Spill to stack is no longer needed.
     StackSpill->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -253,6 +293,9 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
         OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
         /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackReload->getParent()].addReg(VectorReg);
+
     // Reload from stack is no longer needed.
     StackReload->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -261,7 +304,86 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
 void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
                                   LiveRegUnits &LRU) {
-  // TODO: Unimplemented
+  // Collect the parent MBBs of Spills for fast lookup.
+  DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size());
+  DenseSet<MachineInstr *> Spills(Entry.Spills.size());
+  for (const auto &Data : Entry.Spills) {
+    SpillMBBs.insert(Data.MI->getParent());
+    Spills.insert(Data.MI);
+  }
+
+  /// Walks up the instructions in \p Reload's block, stopping at a spill if
+  /// found. \Returns true if a spill was found, false otherwise.
+  auto AccumulateLRUUntilSpillFn = [&Spills, &SpillMBBs](MachineInstr *Reload,
+                                                         LiveRegUnits &LRU) {
+    MachineBasicBlock *MBB = Reload->getParent();
+    bool IsSpillBlock = SpillMBBs.count(MBB);
+    // Add all MBB's live-outs.
+    LRU.addLiveOuts(*MBB);
+    // Walk up the BB, starting from Reload, looking for any spill.
+    for (MachineInstr *CurrMI = Reload; CurrMI != nullptr;
+         CurrMI = CurrMI->getPrevNode()) {
+      LRU.accumulate(*CurrMI);
+      // If a spill is found then return true to end the recursion.
+      if (IsSpillBlock && Spills.count(CurrMI))
+        return true;
+    }
+    return false;
+  };
+
+  // Helper for the traversal. It accumulates all register units used in \p
+  // MBB. It returns true once a spill is found.
+  auto AccumulateLRUFn = [&SpillMBBs, &LRU, AccumulateLRUUntilSpillFn,
+                          this](MachineBasicBlock *MBB) {
+    if (SpillMBBs.count(MBB)) {
+      // If this is a spill block, then walk bottom-up until the spill.
+      assert(!MBB->empty() && "How can it be a spill block and empty?");
+      bool FoundSpill = AccumulateLRUUntilSpillFn(&*MBB->rbegin(), LRU);
+      assert(FoundSpill && "Spill block but we couldn't find spill!");
+      // We return true to stop the recursion.
+      return true;
+    }
+    // Else this is an intermediate block between the spills and reloads with
+    // no spill in it, so use the pre-computed LRU to avoid walking it again.
+    // This improves compilation time.
+    LRU.addUnits(LRUs[MBB].getBitVector());
+    // We return false to continue the recursion.
+    return false;
+  };
+
+  /// \Returns the LiveRegUnits at `Reload` by stepping backwards through the BB.
+  auto GetReloadLRU = [this](MachineInstr *Reload) {
+    LiveRegUnits ReloadLRU(*TRI);
+    MachineBasicBlock *MBB = Reload->getParent();
+    ReloadLRU.addLiveOuts(*MBB);
+    // Start at the bottom of the BB and walk up until we find `Reload`.
+    for (MachineInstr &MI : llvm::reverse(*MBB)) {
+      if (&MI == Reload)
+        break;
+      // TODO: Check if this should be accumulate() instead of stepBackward().
+      ReloadLRU.stepBackward(MI);
+    }
+    return ReloadLRU;
+  };
+
+  // Start from each Reload and walk up the CFG with a depth-first traversal,
+  // looking for spills. Upon finding a spill we don't go beyond that point. In
+  // the meantime we accumulate the registers used. This is then used to find
+  // free physical registers.
+  DenseSet<MachineBasicBlock *> Visited;
+  for (const auto &ReloadData : Entry.Reloads) {
+    MachineInstr *Reload = ReloadData.MI;
+    // Add the Reload's LRU to the total LRU for the whole Spill-Reload range.
+    LiveRegUnits ReloadLRU = GetReloadLRU(Reload);
+    bool FoundSpill = AccumulateLRUUntilSpillFn(Reload, ReloadLRU);
+    LRU.addUnits(ReloadLRU.getBitVector());
+
+    // Traverse the CFG bottom-up accumulating LRUs until we reach the Spills.
+    if (!FoundSpill) {
+      for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors())
+        DFS(PredMBB, Visited, AccumulateLRUFn);
+    }
+  }
 }
 
 void Spill2Reg::generateCode() {
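One subtlety in GetReloadLRU above, which the in-code TODO leaves open: LiveRegUnits::stepBackward() computes precise liveness at a point (definitions kill a register, uses revive it), whereas accumulate() takes the union of everything defined or used. A model of the two update rules on the sketch's RegSet type, with made-up function names:

// The two LiveRegUnits update rules, modeled on the sketch's RegSet:
// stepBackward is precise liveness at a point; accumulate is the
// conservative union over a range.
RegSet stepBackwardModel(RegSet Live, RegSet Defs, RegSet Uses) {
  return (Live & ~Defs) | Uses; // defs kill, uses revive
}
RegSet accumulateModel(RegSet Live, RegSet Defs, RegSet Uses) {
  return Live | Defs | Uses; // never clears a bit
}

As written, the patch applies the precise rule only to the instructions below the reload and the conservative union everywhere between the reloads and the spills; the TODO records that this choice is still being evaluated.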
@@ -292,7 +414,10 @@ void Spill2Reg::generateCode() {
   }
 }
 
-void Spill2Reg::cleanup() { StackSlotData.clear(); }
+void Spill2Reg::cleanup() {
+  StackSlotData.clear();
+  LRUs.clear();
+}
 
 bool Spill2Reg::run() {
   // Walk over each instruction in the code keeping track of the processor's
Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s
+
+; End-to-end check that Spill2Reg works with 16-bit registers.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@D0 = dso_local local_unnamed_addr global i16 0, align 4
+@D1 = dso_local local_unnamed_addr global i16 0, align 4
+@D2 = dso_local local_unnamed_addr global i16 0, align 4
+@D3 = dso_local local_unnamed_addr global i16 0, align 4
+@D4 = dso_local local_unnamed_addr global i16 0, align 4
+@D5 = dso_local local_unnamed_addr global i16 0, align 4
+@D6 = dso_local local_unnamed_addr global i16 0, align 4
+@D7 = dso_local local_unnamed_addr global i16 0, align 4
+@D8 = dso_local local_unnamed_addr global i16 0, align 4
+@D9 = dso_local local_unnamed_addr global i16 0, align 4
+@D10 = dso_local local_unnamed_addr global i16 0, align 4
+@D11 = dso_local local_unnamed_addr global i16 0, align 4
+@D12 = dso_local local_unnamed_addr global i16 0, align 4
+@D13 = dso_local local_unnamed_addr global i16 0, align 4
+@D14 = dso_local local_unnamed_addr global i16 0, align 4
+@D15 = dso_local local_unnamed_addr global i16 0, align 4
+@D16 = dso_local local_unnamed_addr global i16 0, align 4
+@D17 = dso_local local_unnamed_addr global i16 0, align 4
+@D18 = dso_local local_unnamed_addr global i16 0, align 4
+@U0 = dso_local local_unnamed_addr global i16 0, align 4
+@U1 = dso_local local_unnamed_addr global i16 0, align 4
+@U2 = dso_local local_unnamed_addr global i16 0, align 4
+@U3 = dso_local local_unnamed_addr global i16 0, align 4
+@U4 = dso_local local_unnamed_addr global i16 0, align 4
+@U5 = dso_local local_unnamed_addr global i16 0, align 4
+@U6 = dso_local local_unnamed_addr global i16 0, align 4
+@U7 = dso_local local_unnamed_addr global i16 0, align 4
+@U8 = dso_local local_unnamed_addr global i16 0, align 4
+@U9 = dso_local local_unnamed_addr global i16 0, align 4
+@U10 = dso_local local_unnamed_addr global i16 0, align 4
+@U11 = dso_local local_unnamed_addr global i16 0, align 4
+@U12 = dso_local local_unnamed_addr global i16 0, align 4
+@U13 = dso_local local_unnamed_addr global i16 0, align 4
+@U14 = dso_local local_unnamed_addr global i16 0, align 4
+@U15 = dso_local local_unnamed_addr global i16 0, align 4
+@U16 = dso_local local_unnamed_addr global i16 0, align 4
+@U17 = dso_local local_unnamed_addr global i16 0, align 4
+@U18 = dso_local local_unnamed_addr global i16 0, align 4
+
+; Function Attrs: mustprogress noinline nounwind uwtable
+define dso_local void @_Z5spillv() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z5spillv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movzwl D0(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D1(%rip), %ecx
+; CHECK-NEXT:    movzwl D2(%rip), %edx
+; CHECK-NEXT:    movzwl D3(%rip), %esi
+; CHECK-NEXT:    movzwl D4(%rip), %edi
+; CHECK-NEXT:    movzwl D5(%rip), %r8d
+; CHECK-NEXT:    movzwl D6(%rip), %r9d
+; CHECK-NEXT:    movzwl D7(%rip), %r10d
+; CHECK-NEXT:    movzwl D8(%rip), %r11d
+; CHECK-NEXT:    movzwl D9(%rip), %ebx
+; CHECK-NEXT:    movzwl D10(%rip), %ebp
+; CHECK-NEXT:    movzwl D11(%rip), %r14d
+; CHECK-NEXT:    movzwl D12(%rip), %r15d
+; CHECK-NEXT:    movzwl D13(%rip), %r12d
+; CHECK-NEXT:    movzwl D14(%rip), %r13d
+; CHECK-NEXT:    movzwl D15(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D16(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D17(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    movzwl D18(%rip), %eax
+; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U0(%rip)
+; CHECK-NEXT:    movw %cx, U1(%rip)
+; CHECK-NEXT:    movw %dx, U2(%rip)
+; CHECK-NEXT:    movw %si, U3(%rip)
+; CHECK-NEXT:    movw %di, U4(%rip)
+; CHECK-NEXT:    movw %r8w, U5(%rip)
+; CHECK-NEXT:    movw %r9w, U6(%rip)
+; CHECK-NEXT:    movw %r10w, U7(%rip)
+; CHECK-NEXT:    movw %r11w, U8(%rip)
+; CHECK-NEXT:    movw %bx, U9(%rip)
+; CHECK-NEXT:    movw %bp, U10(%rip)
+; CHECK-NEXT:    movw %r14w, U11(%rip)
+; CHECK-NEXT:    movw %r15w, U12(%rip)
+; CHECK-NEXT:    movw %r12w, U13(%rip)
+; CHECK-NEXT:    movw %r13w, U14(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U15(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U16(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U17(%rip)
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    movw %ax, U18(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i16, i16* @D0
+  %1 = load i16, i16* @D1
+  %2 = load i16, i16* @D2
+  %3 = load i16, i16* @D3
+  %4 = load i16, i16* @D4
+  %5 = load i16, i16* @D5
+  %6 = load i16, i16* @D6
+  %7 = load i16, i16* @D7
+  %8 = load i16, i16* @D8
+  %9 = load i16, i16* @D9
+  %10 = load i16, i16* @D10
+  %11 = load i16, i16* @D11
+  %12 = load i16, i16* @D12
+  %13 = load i16, i16* @D13
+  %14 = load i16, i16* @D14
+  %15 = load i16, i16* @D15
+  %16 = load i16, i16* @D16
+  %17 = load i16, i16* @D17
+  %18 = load i16, i16* @D18
+  call void asm sideeffect "", "~{memory}"() #1
+  store i16 %0, i16* @U0
+  store i16 %1, i16* @U1
+  store i16 %2, i16* @U2
+  store i16 %3, i16* @U3
+  store i16 %4, i16* @U4
+  store i16 %5, i16* @U5
+  store i16 %6, i16* @U6
+  store i16 %7, i16* @U7
+  store i16 %8, i16* @U8
+  store i16 %9, i16* @U9
+  store i16 %10, i16* @U10
+  store i16 %11, i16* @U11
+  store i16 %12, i16* @U12
+  store i16 %13, i16* @U13
+  store i16 %14, i16* @U14
+  store i16 %15, i16* @U15
+  store i16 %16, i16* @U16
+  store i16 %17, i16* @U17
+  store i16 %18, i16* @U18
+  ret void
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nounwind }
