Skip to content

Commit 19367bb

Browse files
epilk, slinder1, RamNalamothu
committed
[AMDGPU] Implement CFI for non-kernel functions
This does not implement CFI for CSR spills other than those AMDGPU handles during PEI (prologue/epilogue insertion). The remaining spills are handled in a subsequent patch. Co-authored-by: Scott Linder <[email protected]> Co-authored-by: Venkata Ramanaiah Nalamothu <[email protected]>
1 parent 152d9fc commit 19367bb

File tree

87 files changed

+25401
-1274
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

87 files changed

+25401
-1274
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 291 additions & 13 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/SIFrameLowering.h

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,8 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
3939
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB,
4040
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
4141
LiveRegUnits &LiveUnits, Register FrameReg,
42-
Register FramePtrRegScratchCopy) const;
42+
Register FramePtrRegScratchCopy,
43+
const bool NeedsFrameMoves) const;
4344
void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB,
4445
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
4546
LiveRegUnits &LiveUnits, Register FrameReg,
@@ -101,6 +102,15 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
101102
Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
102103
Register ScratchWaveOffsetReg) const;
103104

105+
void emitPrologueEntryCFI(MachineBasicBlock &MBB,
106+
MachineBasicBlock::iterator MBBI,
107+
const DebugLoc &DL) const;
108+
109+
void emitDefCFA(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
110+
DebugLoc const &DL, Register StackPtrReg,
111+
bool AspaceAlreadyDefined,
112+
MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const;
113+
104114
public:
105115
bool requiresStackPointerReference(const MachineFunction &MF) const;
106116

@@ -110,6 +120,24 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
110120
const DebugLoc &DL, const MCCFIInstruction &CFIInst,
111121
MachineInstr::MIFlag flag = MachineInstr::FrameSetup) const;
112122

123+
/// Create a CFI index describing a spill of an SGPR to a single lane of
124+
/// a VGPR and build a MachineInstr around it.
125+
MachineInstr *buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB,
126+
MachineBasicBlock::iterator MBBI,
127+
const DebugLoc &DL,
128+
const Register SGPR,
129+
const Register VGPR,
130+
const int Lane) const;
131+
/// Create a CFI index describing a spill of an SGPR to multiple lanes of
132+
/// VGPRs and build a MachineInstr around it.
133+
MachineInstr *buildCFIForSGPRToVGPRSpill(
134+
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
135+
const DebugLoc &DL, Register SGPR,
136+
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const;
137+
MachineInstr *buildCFIForRegToSGPRPairSpill(MachineBasicBlock &MBB,
138+
MachineBasicBlock::iterator MBBI,
139+
const DebugLoc &DL, Register Reg,
140+
Register SGPRPair) const;
113141
// Returns true if the function may need to reserve space on the stack for the
114142
// CWSR trap handler.
115143
bool mayReserveScratchForCWSR(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -222,11 +222,11 @@ define void @func_caller_stack() {
222222
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
223223
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
224224
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
225+
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
225226
; MUBUF-NEXT: s_addk_i32 s32, 0x400
226227
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
227228
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
228229
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
229-
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
230230
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
231231
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
232232
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
@@ -257,8 +257,8 @@ define void @func_caller_stack() {
257257
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
258258
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
259259
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
260-
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
261260
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
261+
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
262262
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
263263
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
264264
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
@@ -300,10 +300,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
300300
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
301301
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
302302
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
303+
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
303304
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
304305
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
305306
; MUBUF-NEXT: s_addk_i32 s32, 0x400
306-
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
307307
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
308308
; MUBUF-NEXT: s_getpc_b64 s[4:5]
309309
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
@@ -382,9 +382,9 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
382382
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
383383
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
384384
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
385+
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
385386
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
386387
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
387-
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
388388
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
389389
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
390390
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -363,7 +363,6 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
363363
; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
364364
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
365365
; GFX9-NEXT: v_mov_b32_e32 v0, 0
366-
; GFX9-NEXT: s_mov_b32 s33, s6
367366
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
368367
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
369368
; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff
@@ -377,6 +376,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
377376
; GFX9-NEXT: s_add_u32 s32, s5, s4
378377
; GFX9-NEXT: s_mov_b32 s32, s34
379378
; GFX9-NEXT: s_mov_b32 s34, s7
379+
; GFX9-NEXT: s_mov_b32 s33, s6
380380
; GFX9-NEXT: s_waitcnt vmcnt(0)
381381
; GFX9-NEXT: s_setpc_b64 s[30:31]
382382
;
@@ -394,7 +394,6 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
394394
; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
395395
; GFX10-NEXT: v_mov_b32_e32 v0, 0
396396
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
397-
; GFX10-NEXT: s_mov_b32 s33, s6
398397
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
399398
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
400399
; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff
@@ -408,6 +407,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
408407
; GFX10-NEXT: s_add_u32 s32, s5, s4
409408
; GFX10-NEXT: s_mov_b32 s32, s34
410409
; GFX10-NEXT: s_mov_b32 s34, s7
410+
; GFX10-NEXT: s_mov_b32 s33, s6
411411
; GFX10-NEXT: s_setpc_b64 s[30:31]
412412
;
413413
; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
@@ -424,7 +424,6 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
424424
; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
425425
; GFX11-NEXT: v_mov_b32_e32 v0, 0
426426
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
427-
; GFX11-NEXT: s_mov_b32 s33, s2
428427
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
429428
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
430429
; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff
@@ -439,6 +438,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
439438
; GFX11-NEXT: s_add_u32 s32, s1, s0
440439
; GFX11-NEXT: s_mov_b32 s32, s34
441440
; GFX11-NEXT: s_mov_b32 s34, s3
441+
; GFX11-NEXT: s_mov_b32 s33, s2
442442
; GFX11-NEXT: s_setpc_b64 s[30:31]
443443
%n = load i32, ptr addrspace(4) @gv
444444
%alloca = alloca i32, i32 %n, align 32, addrspace(5)

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -235,11 +235,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
235235
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
236236
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
237237
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
238+
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
238239
; GFX9-NEXT: v_mov_b32_e32 v0, 0
239240
; GFX9-NEXT: v_mov_b32_e32 v1, 0
240241
; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
241242
; GFX9-NEXT: s_waitcnt vmcnt(0)
242-
; GFX9-NEXT: v_writelane_b32 v40, s16, 2
243243
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
244244
; GFX9-NEXT: s_addk_i32 s32, 0x400
245245
; GFX9-NEXT: v_writelane_b32 v40, s31, 1

llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -151,8 +151,8 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
151151
; GCN: ; %bb.0: ; %entry
152152
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153153
; GCN-NEXT: s_mov_b32 s7, s33
154-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
155154
; GCN-NEXT: s_mov_b32 s33, s32
155+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
156156
; GCN-NEXT: s_addk_i32 s32, 0x400
157157
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
158158
; GCN-NEXT: s_cbranch_execz .LBB2_3
@@ -217,9 +217,9 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
217217
; GCN-NEXT: s_mov_b32 s7, s33
218218
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
219219
; GCN-NEXT: s_mov_b32 s8, s34
220-
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
221220
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
222221
; GCN-NEXT: s_mov_b32 s34, s32
222+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
223223
; GCN-NEXT: s_addk_i32 s32, 0x2000
224224
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
225225
; GCN-NEXT: s_cbranch_execz .LBB3_2

0 commit comments

Comments
 (0)