Skip to content

Commit 3027b4a

Browse files
authored
[AMDGPU] Fix iterator invalidation during frame lowering (llvm#163952)
I was a bit too eager to remove the SI_WHOLE_WAVE_FUNC_SETUP instruction during prolog emission. Erasing it invalidates MBBI, which in some cases is still needed outside of `emitCSRSpillStores`. Do the erasing at the end of prolog insertion instead.
1 parent ed9c75a commit 3027b4a

File tree

2 files changed

+137
-5
lines changed

2 files changed

+137
-5
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,16 +1034,13 @@ void SIFrameLowering::emitCSRSpillStores(
10341034

10351035
StoreWWMRegisters(WWMCalleeSavedRegs);
10361036
if (FuncInfo->isWholeWaveFunction()) {
1037-
// SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
1038-
// it now. If we have already saved some WWM CSR registers, then the EXEC is
1039-
// already -1 and we don't need to do anything else. Otherwise, set EXEC to
1040-
// -1 here.
1037+
// If we have already saved some WWM CSR registers, then the EXEC is already
1038+
// -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
10411039
if (!ScratchExecCopy)
10421040
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
10431041
/*EnableInactiveLanes*/ true);
10441042
else if (WWMCalleeSavedRegs.empty())
10451043
EnableAllLanes();
1046-
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
10471044
} else if (ScratchExecCopy) {
10481045
// FIXME: Split block and make terminator.
10491046
BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
@@ -1340,6 +1337,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
13401337
"Needed to save BP but didn't save it anywhere");
13411338

13421339
assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1340+
1341+
if (FuncInfo->isWholeWaveFunction()) {
1342+
// SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
1343+
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
1344+
}
13431345
}
13441346

13451347
void SIFrameLowering::emitEpilogue(MachineFunction &MF,

llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,136 @@ define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
767767
ret void
768768
}
769769

770+
define amdgpu_gfx_whole_wave void @realign_stack(i1 %active, i32 %x) {
771+
; DAGISEL-LABEL: realign_stack:
772+
; DAGISEL: ; %bb.0:
773+
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
774+
; DAGISEL-NEXT: s_wait_expcnt 0x0
775+
; DAGISEL-NEXT: s_wait_samplecnt 0x0
776+
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
777+
; DAGISEL-NEXT: s_wait_kmcnt 0x0
778+
; DAGISEL-NEXT: s_mov_b32 s1, s33
779+
; DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff
780+
; DAGISEL-NEXT: s_wait_alu 0xfffe
781+
; DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00
782+
; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
783+
; DAGISEL-NEXT: s_mov_b32 s2, s34
784+
; DAGISEL-NEXT: s_mov_b32 s34, s32
785+
; DAGISEL-NEXT: s_addk_co_i32 s32, 0x800
786+
; DAGISEL-NEXT: s_wait_storecnt 0x0
787+
; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
788+
; DAGISEL-NEXT: s_wait_storecnt 0x0
789+
; DAGISEL-NEXT: s_wait_alu 0xfffe
790+
; DAGISEL-NEXT: s_mov_b32 s32, s34
791+
; DAGISEL-NEXT: s_mov_b32 s34, s2
792+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
793+
; DAGISEL-NEXT: s_mov_b32 s33, s1
794+
; DAGISEL-NEXT: s_wait_alu 0xfffe
795+
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
796+
;
797+
; GISEL-LABEL: realign_stack:
798+
; GISEL: ; %bb.0:
799+
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
800+
; GISEL-NEXT: s_wait_expcnt 0x0
801+
; GISEL-NEXT: s_wait_samplecnt 0x0
802+
; GISEL-NEXT: s_wait_bvhcnt 0x0
803+
; GISEL-NEXT: s_wait_kmcnt 0x0
804+
; GISEL-NEXT: s_mov_b32 s1, s33
805+
; GISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff
806+
; GISEL-NEXT: s_wait_alu 0xfffe
807+
; GISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00
808+
; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
809+
; GISEL-NEXT: s_mov_b32 s2, s34
810+
; GISEL-NEXT: s_mov_b32 s34, s32
811+
; GISEL-NEXT: s_addk_co_i32 s32, 0x800
812+
; GISEL-NEXT: s_wait_storecnt 0x0
813+
; GISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
814+
; GISEL-NEXT: s_wait_storecnt 0x0
815+
; GISEL-NEXT: s_wait_alu 0xfffe
816+
; GISEL-NEXT: s_mov_b32 s32, s34
817+
; GISEL-NEXT: s_mov_b32 s34, s2
818+
; GISEL-NEXT: s_mov_b32 exec_lo, s0
819+
; GISEL-NEXT: s_mov_b32 s33, s1
820+
; GISEL-NEXT: s_wait_alu 0xfffe
821+
; GISEL-NEXT: s_setpc_b64 s[30:31]
822+
;
823+
; DAGISEL64-LABEL: realign_stack:
824+
; DAGISEL64: ; %bb.0:
825+
; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
826+
; DAGISEL64-NEXT: s_wait_expcnt 0x0
827+
; DAGISEL64-NEXT: s_wait_samplecnt 0x0
828+
; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
829+
; DAGISEL64-NEXT: s_wait_kmcnt 0x0
830+
; DAGISEL64-NEXT: s_mov_b32 s2, s33
831+
; DAGISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff
832+
; DAGISEL64-NEXT: s_wait_alu 0xfffe
833+
; DAGISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00
834+
; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
835+
; DAGISEL64-NEXT: s_mov_b32 s3, s34
836+
; DAGISEL64-NEXT: s_mov_b32 s34, s32
837+
; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x800
838+
; DAGISEL64-NEXT: s_wait_storecnt 0x0
839+
; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
840+
; DAGISEL64-NEXT: s_wait_storecnt 0x0
841+
; DAGISEL64-NEXT: s_wait_alu 0xfffe
842+
; DAGISEL64-NEXT: s_mov_b32 s32, s34
843+
; DAGISEL64-NEXT: s_mov_b32 s34, s3
844+
; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
845+
; DAGISEL64-NEXT: s_mov_b32 s33, s2
846+
; DAGISEL64-NEXT: s_wait_alu 0xfffe
847+
; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
848+
;
849+
; GISEL64-LABEL: realign_stack:
850+
; GISEL64: ; %bb.0:
851+
; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
852+
; GISEL64-NEXT: s_wait_expcnt 0x0
853+
; GISEL64-NEXT: s_wait_samplecnt 0x0
854+
; GISEL64-NEXT: s_wait_bvhcnt 0x0
855+
; GISEL64-NEXT: s_wait_kmcnt 0x0
856+
; GISEL64-NEXT: s_mov_b32 s2, s33
857+
; GISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff
858+
; GISEL64-NEXT: s_wait_alu 0xfffe
859+
; GISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00
860+
; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
861+
; GISEL64-NEXT: s_mov_b32 s3, s34
862+
; GISEL64-NEXT: s_mov_b32 s34, s32
863+
; GISEL64-NEXT: s_addk_co_i32 s32, 0x800
864+
; GISEL64-NEXT: s_wait_storecnt 0x0
865+
; GISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
866+
; GISEL64-NEXT: s_wait_storecnt 0x0
867+
; GISEL64-NEXT: s_wait_alu 0xfffe
868+
; GISEL64-NEXT: s_mov_b32 s32, s34
869+
; GISEL64-NEXT: s_mov_b32 s34, s3
870+
; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
871+
; GISEL64-NEXT: s_mov_b32 s33, s2
872+
; GISEL64-NEXT: s_wait_alu 0xfffe
873+
; GISEL64-NEXT: s_setpc_b64 s[30:31]
874+
;
875+
; GFX1250-DAGISEL-LABEL: realign_stack:
876+
; GFX1250-DAGISEL: ; %bb.0:
877+
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
878+
; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0
879+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s1, s33
880+
; GFX1250-DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff
881+
; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
882+
; GFX1250-DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00
883+
; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
884+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s2, s34
885+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s32
886+
; GFX1250-DAGISEL-NEXT: s_addk_co_i32 s32, 0x800
887+
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS
888+
; GFX1250-DAGISEL-NEXT: s_wait_storecnt 0x0
889+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, s34
890+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s2
891+
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
892+
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0
893+
; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s1
894+
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
895+
%fussy = alloca i32, align 1024, addrspace(5)
896+
store volatile i32 %x, ptr addrspace(5) %fussy, align 1024
897+
ret void
898+
}
899+
770900
define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
771901
; DAGISEL-LABEL: multiple_blocks:
772902
; DAGISEL: ; %bb.0:

0 commit comments

Comments
 (0)