Skip to content

Commit 23dd0e8

Browse files
easyonaadit, Pravin Jagtap, and arsenm
authored and committed
[AMDGPU] Restore SP from saved-FP or saved-BP (llvm#124007)
Currently, the AMDGPU backend bumps the Stack Pointer by fixed-size offsets in the prolog of device functions, and restores it by the same amount in the epilog.

    Prolog: sp += frameSize
    Epilog: sp -= frameSize

If a function has dynamic stack realignment,

    Prolog: sp += frameSize + max_alignment
    Epilog: sp -= frameSize + max_alignment

These calculations are not optimal in case of dynamic stack realignment, and completely fail in case of dynamic stack readjustment. This patch uses the saved Frame Pointer to restore SP.

    Prolog: fp = sp
            sp += frameSize
    Epilog: sp = fp

In case of dynamic stack realignment, SP is restored from the saved Base Pointer.

    Prolog: fp = sp + (max_alignment - 1)
            fp = fp & (-max_alignment)
            bp = sp
            sp += frameSize + max_alignment
    Epilog: sp = bp

(Note: The presence of BP has been enforced in case of any dynamic stack realignment.)

---------

Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Matt Arsenault <[email protected]>
1 parent 46dcee6 commit 23dd0e8

File tree

63 files changed

+1329
-1046
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+1329
-1046
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,6 +1512,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15121512
const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
15131513
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
15141514
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1515+
if (RoundedSize != 0) {
1516+
if (TRI.hasBasePointer(MF))
1517+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1518+
.addReg(TRI.getBaseRegister())
1519+
.setMIFlag(MachineInstr::FrameDestroy);
1520+
else if (hasFP(MF))
1521+
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1522+
.addReg(FramePtrReg)
1523+
.setMIFlag(MachineInstr::FrameDestroy);
1524+
}
15151525

15161526
Register FramePtrRegScratchCopy;
15171527
Register SGPRForFPSaveRestoreCopy =
@@ -1537,14 +1547,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
15371547
FramePtrRegScratchCopy);
15381548
}
15391549

1540-
if (RoundedSize != 0 && hasFP(MF)) {
1541-
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1542-
.addReg(StackPtrReg)
1543-
.addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
1544-
.setMIFlag(MachineInstr::FrameDestroy);
1545-
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1546-
}
1547-
15481550
// FIXME: Switch to using MF.needsFrameMoves() later
15491551
const bool NeedsFrameMoves = true;
15501552
if (hasFP(MF)) {

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -530,8 +530,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
530530
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
531531
// When we need stack realignment, we can't reference off of the
532532
// stack pointer, so we reserve a base pointer.
533-
const MachineFrameInfo &MFI = MF.getFrameInfo();
534-
return MFI.getNumFixedObjects() && shouldRealignStack(MF);
533+
return shouldRealignStack(MF);
535534
}
536535

537536
Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() {
2727
; CHECK-NEXT: global_store_dword v[0:1], v2, off
2828
; CHECK-NEXT: s_waitcnt vmcnt(0)
2929
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
30+
; CHECK-NEXT: s_mov_b32 s32, s33
3031
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
3132
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
3233
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3334
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
34-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
3535
; CHECK-NEXT: s_mov_b32 s33, s4
3636
; CHECK-NEXT: s_waitcnt vmcnt(0)
3737
; CHECK-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,11 +247,11 @@ define void @func_caller_stack() {
247247
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
248248
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
249249
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
250+
; MUBUF-NEXT: s_mov_b32 s32, s33
250251
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
251252
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
252253
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
253254
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
254-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
255255
; MUBUF-NEXT: s_mov_b32 s33, s4
256256
; MUBUF-NEXT: s_waitcnt vmcnt(0)
257257
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -286,11 +286,11 @@ define void @func_caller_stack() {
286286
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
287287
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
288288
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
289+
; FLATSCR-NEXT: s_mov_b32 s32, s33
289290
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
290291
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
291292
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
292293
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
293-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
294294
; FLATSCR-NEXT: s_mov_b32 s33, s0
295295
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
296296
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
@@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
372372
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
373373
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
374374
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
375+
; MUBUF-NEXT: s_mov_b32 s32, s33
375376
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
376377
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
377378
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
378379
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
379-
; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
380380
; MUBUF-NEXT: s_mov_b32 s33, s4
381381
; MUBUF-NEXT: s_waitcnt vmcnt(0)
382382
; MUBUF-NEXT: s_setpc_b64 s[30:31]
@@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
437437
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
438438
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
439439
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
440+
; FLATSCR-NEXT: s_mov_b32 s32, s33
440441
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
441442
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
442443
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
443444
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
444-
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
445445
; FLATSCR-NEXT: s_mov_b32 s33, s0
446446
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
447447
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
8585
; GFX9-NEXT: s_and_b32 s4, s4, -16
8686
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
8787
; GFX9-NEXT: s_add_u32 s32, s6, s4
88-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
88+
; GFX9-NEXT: s_mov_b32 s32, s33
8989
; GFX9-NEXT: s_mov_b32 s33, s7
9090
; GFX9-NEXT: s_waitcnt vmcnt(0)
9191
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -111,7 +111,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
111111
; GFX10-NEXT: s_and_b32 s4, s4, -16
112112
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
113113
; GFX10-NEXT: s_add_u32 s32, s6, s4
114-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
114+
; GFX10-NEXT: s_mov_b32 s32, s33
115115
; GFX10-NEXT: s_mov_b32 s33, s7
116116
; GFX10-NEXT: s_setpc_b64 s[30:31]
117117
;
@@ -135,9 +135,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
135135
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
136136
; GFX11-NEXT: s_and_b32 s0, s0, -16
137137
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
138-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
138+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
139139
; GFX11-NEXT: s_add_u32 s32, s2, s0
140-
; GFX11-NEXT: s_add_i32 s32, s32, -16
140+
; GFX11-NEXT: s_mov_b32 s32, s33
141141
; GFX11-NEXT: s_mov_b32 s33, s3
142142
; GFX11-NEXT: s_setpc_b64 s[30:31]
143143
%n = load i32, ptr addrspace(4) @gv, align 4
@@ -226,7 +226,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
226226
; GFX9-NEXT: s_and_b32 s4, s4, -16
227227
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
228228
; GFX9-NEXT: s_add_u32 s32, s6, s4
229-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
229+
; GFX9-NEXT: s_mov_b32 s32, s33
230230
; GFX9-NEXT: s_mov_b32 s33, s7
231231
; GFX9-NEXT: s_waitcnt vmcnt(0)
232232
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -252,7 +252,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
252252
; GFX10-NEXT: s_and_b32 s4, s4, -16
253253
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
254254
; GFX10-NEXT: s_add_u32 s32, s6, s4
255-
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
255+
; GFX10-NEXT: s_mov_b32 s32, s33
256256
; GFX10-NEXT: s_mov_b32 s33, s7
257257
; GFX10-NEXT: s_setpc_b64 s[30:31]
258258
;
@@ -276,9 +276,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
276276
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
277277
; GFX11-NEXT: s_and_b32 s0, s0, -16
278278
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
279-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
279+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280280
; GFX11-NEXT: s_add_u32 s32, s2, s0
281-
; GFX11-NEXT: s_add_i32 s32, s32, -16
281+
; GFX11-NEXT: s_mov_b32 s32, s33
282282
; GFX11-NEXT: s_mov_b32 s33, s3
283283
; GFX11-NEXT: s_setpc_b64 s[30:31]
284284
%n = load i32, ptr addrspace(4) @gv, align 16
@@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
355355
; GFX9-NEXT: s_mov_b32 s6, s33
356356
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
357357
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
358+
; GFX9-NEXT: s_mov_b32 s7, s34
359+
; GFX9-NEXT: s_mov_b32 s34, s32
358360
; GFX9-NEXT: s_addk_i32 s32, 0x1000
359361
; GFX9-NEXT: s_getpc_b64 s[4:5]
360362
; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -372,7 +374,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
372374
; GFX9-NEXT: s_and_b32 s4, s4, -16
373375
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
374376
; GFX9-NEXT: s_add_u32 s32, s5, s4
375-
; GFX9-NEXT: s_addk_i32 s32, 0xf000
377+
; GFX9-NEXT: s_mov_b32 s32, s34
378+
; GFX9-NEXT: s_mov_b32 s34, s7
376379
; GFX9-NEXT: s_mov_b32 s33, s6
377380
; GFX9-NEXT: s_waitcnt vmcnt(0)
378381
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -382,7 +385,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
382385
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383386
; GFX10-NEXT: s_mov_b32 s6, s33
384387
; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
388+
; GFX10-NEXT: s_mov_b32 s7, s34
385389
; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
390+
; GFX10-NEXT: s_mov_b32 s34, s32
386391
; GFX10-NEXT: s_addk_i32 s32, 0x800
387392
; GFX10-NEXT: s_getpc_b64 s[4:5]
388393
; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
@@ -400,7 +405,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
400405
; GFX10-NEXT: s_and_b32 s4, s4, -16
401406
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
402407
; GFX10-NEXT: s_add_u32 s32, s5, s4
403-
; GFX10-NEXT: s_addk_i32 s32, 0xf800
408+
; GFX10-NEXT: s_mov_b32 s32, s34
409+
; GFX10-NEXT: s_mov_b32 s34, s7
404410
; GFX10-NEXT: s_mov_b32 s33, s6
405411
; GFX10-NEXT: s_setpc_b64 s[30:31]
406412
;
@@ -409,8 +415,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
409415
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410416
; GFX11-NEXT: s_mov_b32 s2, s33
411417
; GFX11-NEXT: s_add_i32 s33, s32, 31
412-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
418+
; GFX11-NEXT: s_mov_b32 s3, s34
413419
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
420+
; GFX11-NEXT: s_mov_b32 s34, s32
414421
; GFX11-NEXT: s_add_i32 s32, s32, 64
415422
; GFX11-NEXT: s_getpc_b64 s[0:1]
416423
; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
@@ -429,8 +436,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
429436
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
430437
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
431438
; GFX11-NEXT: s_add_u32 s32, s1, s0
432-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
433-
; GFX11-NEXT: s_addk_i32 s32, 0xffc0
439+
; GFX11-NEXT: s_mov_b32 s32, s34
440+
; GFX11-NEXT: s_mov_b32 s34, s3
434441
; GFX11-NEXT: s_mov_b32 s33, s2
435442
; GFX11-NEXT: s_setpc_b64 s[30:31]
436443
%n = load i32, ptr addrspace(4) @gv

llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
248248
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
249249
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
250250
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
251+
; GFX9-NEXT: s_mov_b32 s32, s33
251252
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
252253
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
253254
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
254255
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
255-
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
256256
; GFX9-NEXT: s_mov_b32 s33, s4
257257
; GFX9-NEXT: s_waitcnt vmcnt(0)
258258
; GFX9-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
180180
; GCN-NEXT: v_mov_b32_e32 v0, 0
181181
; GCN-NEXT: global_store_dword v[0:1], v0, off
182182
; GCN-NEXT: s_waitcnt vmcnt(0)
183-
; GCN-NEXT: s_addk_i32 s32, 0xfc00
183+
; GCN-NEXT: s_mov_b32 s32, s33
184184
; GCN-NEXT: s_mov_b32 s33, s7
185185
; GCN-NEXT: s_setpc_b64 s[30:31]
186186

@@ -216,7 +216,9 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
216216
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217217
; GCN-NEXT: s_mov_b32 s7, s33
218218
; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
219+
; GCN-NEXT: s_mov_b32 s8, s34
219220
; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
221+
; GCN-NEXT: s_mov_b32 s34, s32
220222
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
221223
; GCN-NEXT: s_addk_i32 s32, 0x2000
222224
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
242244
; GCN-NEXT: v_mov_b32_e32 v0, 0
243245
; GCN-NEXT: global_store_dword v[0:1], v0, off
244246
; GCN-NEXT: s_waitcnt vmcnt(0)
245-
; GCN-NEXT: s_addk_i32 s32, 0xe000
247+
; GCN-NEXT: s_mov_b32 s32, s34
248+
; GCN-NEXT: s_mov_b32 s34, s8
246249
; GCN-NEXT: s_mov_b32 s33, s7
247250
; GCN-NEXT: s_setpc_b64 s[30:31]
248251
entry:

llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 {
3232
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
3333
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
3434
; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
35+
; FIXEDABI-NEXT: s_mov_b32 s32, s33
3536
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
3637
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
3738
; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
3839
; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7]
39-
; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00
4040
; FIXEDABI-NEXT: s_mov_b32 s33, s4
4141
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
4242
; FIXEDABI-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
193193
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
194194
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
195195
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
196+
; CHECK-NEXT: s_mov_b32 s32, s33
196197
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
197198
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
198199
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
199200
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
200-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
201201
; CHECK-NEXT: s_mov_b32 s33, s4
202202
; CHECK-NEXT: s_waitcnt vmcnt(0)
203203
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -329,11 +329,11 @@ define double @test_powr_fast_f64(double %x, double %y) {
329329
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
330330
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
331331
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
332+
; CHECK-NEXT: s_mov_b32 s32, s33
332333
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
333334
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
334335
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
335336
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
336-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
337337
; CHECK-NEXT: s_mov_b32 s33, s4
338338
; CHECK-NEXT: s_waitcnt vmcnt(0)
339339
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -477,11 +477,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
477477
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
478478
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
479479
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
480+
; CHECK-NEXT: s_mov_b32 s32, s33
480481
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
481482
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
482483
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
483484
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
484-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
485485
; CHECK-NEXT: s_mov_b32 s33, s4
486486
; CHECK-NEXT: s_waitcnt vmcnt(0)
487487
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -614,11 +614,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
614614
; CHECK-NEXT: v_readlane_b32 s36, v42, 2
615615
; CHECK-NEXT: v_readlane_b32 s35, v42, 1
616616
; CHECK-NEXT: v_readlane_b32 s34, v42, 0
617+
; CHECK-NEXT: s_mov_b32 s32, s33
617618
; CHECK-NEXT: v_readlane_b32 s4, v42, 14
618619
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
619620
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
620621
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
621-
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
622622
; CHECK-NEXT: s_mov_b32 s33, s4
623623
; CHECK-NEXT: s_waitcnt vmcnt(0)
624624
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -761,11 +761,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
761761
; CHECK-NEXT: v_readlane_b32 s36, v43, 2
762762
; CHECK-NEXT: v_readlane_b32 s35, v43, 1
763763
; CHECK-NEXT: v_readlane_b32 s34, v43, 0
764+
; CHECK-NEXT: s_mov_b32 s32, s33
764765
; CHECK-NEXT: v_readlane_b32 s4, v43, 14
765766
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
766767
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
767768
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
768-
; CHECK-NEXT: s_addk_i32 s32, 0xf800
769769
; CHECK-NEXT: s_mov_b32 s33, s4
770770
; CHECK-NEXT: s_waitcnt vmcnt(0)
771771
; CHECK-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)