diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index dcd4f0f65e8ef..274416ec05481 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1259,6 +1259,17 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, Register FramePtrRegScratchCopy; Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); + + if (MFI.hasVarSizedObjects()) { + assert(TRI.hasBasePointer(MF) && + "Variable sized objects require base pointer to be setup!"); + Register BasePtrReg = TRI.getBaseRegister(); + // Restore SP to fixed frame size + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) + .addReg(BasePtrReg) + .addImm(RoundedSize * getScratchScaleFactor(ST)) + .setMIFlag(MachineInstr::FrameDestroy); + } if (FPSaved) { // CSR spill restores should use FP as base register. If // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 704435dad65d7..5a4e6ec48da82 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -525,8 +525,11 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment, we can't reference off of the // stack pointer, so we reserve a base pointer. + // For functions with dynamically sized stack objects, we need to reference + // off the base pointer in the epilog to restore the stack frame. const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getNumFixedObjects() && shouldRealignStack(MF); + return (MFI.getNumFixedObjects() && shouldRealignStack(MF)) || + MFI.hasVarSizedObjects(); } Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index ae055ea041297..e497ed6526a05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -69,6 +69,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_mov_b32 s8, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -86,6 +88,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 +; GFX9-NEXT: s_add_i32 s32, s34, 0x400 +; GFX9-NEXT: s_mov_b32 s34, s8 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -95,6 +99,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_mov_b32 s8, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -112,6 +118,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 +; GFX10-NEXT: s_add_i32 s32, s34, 0x200 +; GFX10-NEXT: s_mov_b32 s34, s8 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -120,13 +128,15 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_mov_b32 s4, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -136,8 +146,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 +; GFX11-NEXT: s_add_i32 s32, s34, 16 +; GFX11-NEXT: s_mov_b32 s34, s4 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 4 @@ -210,6 +222,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_mov_b32 s8, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -227,6 +241,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 +; GFX9-NEXT: s_add_i32 s32, s34, 0x400 +; GFX9-NEXT: s_mov_b32 s34, s8 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -236,6 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_mov_b32 s8, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -253,6 +271,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 +; GFX10-NEXT: s_add_i32 s32, s34, 0x200 +; GFX10-NEXT: s_mov_b32 s34, s8 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -261,13 +281,15 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_mov_b32 s4, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -277,8 +299,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 +; GFX11-NEXT: s_add_i32 s32, s34, 16 +; GFX11-NEXT: s_mov_b32 s34, s4 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 16 @@ -355,6 +379,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s7, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x1000 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -373,6 +399,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s5, s4 +; GFX9-NEXT: s_add_i32 s32, s34, 0x1000 +; GFX9-NEXT: s_mov_b32 s34, s7 ; GFX9-NEXT: s_addk_i32 s32, 0xf000 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -382,8 +410,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0 -; GFX10-NEXT: s_addk_i32 s32, 0x800 +; GFX10-NEXT: s_mov_b32 s7, s34 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX10-NEXT: s_mov_b32 s34, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x800 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 @@ -401,6 +431,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s5, s4 +; GFX10-NEXT: s_add_i32 s32, s34, 0x800 +; GFX10-NEXT: s_mov_b32 s34, s7 ; GFX10-NEXT: s_addk_i32 s32, 0xf800 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -409,8 +441,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 31 -; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_mov_b32 s3, s34 ; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_mov_b32 s34, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 64 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 @@ -429,7 +463,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-NEXT: s_add_u32 s32, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s32, s34, 64 +; GFX11-NEXT: s_mov_b32 s34, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xffc0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 69abef02d3d92..f4e6b7c033b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -151,8 +151,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 +; GCN-NEXT: s_mov_b32 s8, s34 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB2_3 @@ -178,8 +180,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: .LBB2_3: ; %bb.2 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_add_i32 s32, s34, 0x400 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s34, s8 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -216,8 +220,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 +; GCN-NEXT: s_mov_b32 s8, s34 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x2000 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 @@ -240,8 +246,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: .LBB3_2: ; %bb.1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_add_i32 s32, s34, 0x2000 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s34, s8 ; GCN-NEXT: s_addk_i32 s32, 0xe000 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll index a5f915c48ebee..2bd58f41ec790 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -149,15 +149,15 @@ attributes #0 = { nounwind } ; GCN-NEXT: dynamic_stack: ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x28{{$}} +; GCN-NEXT: .sgpr_count: 0x2a{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x2{{$}} ; GISEL-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: dynamic_stack_loop: ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; SDAG-NEXT: .sgpr_count: 0x25{{$}} -; GISEL-NEXT: .sgpr_count: 0x26{{$}} +; SDAG-NEXT: .sgpr_count: 0x27{{$}} +; GISEL-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x3{{$}} ; GISEL-NEXT: .vgpr_count: 0x4{{$}} @@ -182,22 +182,22 @@ attributes #0 = { nounwind } ; GCN-NEXT: no_stack_extern_call: ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x28{{$}} -; GFX9-NEXT: .sgpr_count: 0x2c{{$}} +; GFX8-NEXT: .sgpr_count: 0x2a{{$}} +; GFX9-NEXT: .sgpr_count: 0x2e{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .backend_stack_size: 0x90{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x28{{$}} -; GFX9-NEXT: .sgpr_count: 0x2c{{$}} +; GFX8-NEXT: .sgpr_count: 0x2a{{$}} +; GFX9-NEXT: .sgpr_count: 0x2e{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .backend_stack_size: 0x10{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x28{{$}} -; GFX9-NEXT: .sgpr_count: 0x2c{{$}} +; GFX8-NEXT: .sgpr_count: 0x2a{{$}} +; GFX9-NEXT: .sgpr_count: 0x2e{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_lds: @@ -227,15 +227,15 @@ attributes #0 = { nounwind } ; GCN-NEXT: simple_stack_extern_call: ; GCN-NEXT: .backend_stack_size: 0x20{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x28{{$}} -; GFX9-NEXT: .sgpr_count: 0x2c{{$}} +; GFX8-NEXT: .sgpr_count: 0x2a{{$}} +; GFX9-NEXT: .sgpr_count: 0x2e{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_stack_indirect_call: ; GCN-NEXT: .backend_stack_size: 0x20{{$}} ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x28{{$}} -; GFX9-NEXT: .sgpr_count: 0x2c{{$}} +; GFX8-NEXT: .sgpr_count: 0x2a{{$}} +; GFX9-NEXT: .sgpr_count: 0x2e{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_stack_recurse: diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 9acb3a42ae102..c298a5609ddb3 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -1064,10 +1064,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1082,8 +1084,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1093,10 +1097,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1111,8 +1117,10 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1122,17 +1130,18 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 @@ -1144,7 +1153,8 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1153,28 +1163,31 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, addrspace(5) @@ -1189,10 +1202,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1208,8 +1223,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x4000 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1220,10 +1237,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1239,8 +1258,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x4000 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1251,17 +1272,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 @@ -1274,7 +1296,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 0x100 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1284,17 +1307,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 ; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 @@ -1305,8 +1329,10 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 0x100 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 128, addrspace(5) @@ -1320,10 +1346,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1338,8 +1366,10 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1349,10 +1379,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1367,8 +1399,10 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1378,17 +1412,18 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 @@ -1400,7 +1435,8 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1409,28 +1445,31 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 2, addrspace(5) @@ -1445,10 +1484,12 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1463,8 +1504,10 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1475,10 +1518,12 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1493,8 +1538,10 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1504,10 +1551,12 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 @@ -1528,7 +1577,8 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1537,10 +1587,12 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 @@ -1554,13 +1606,15 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1575,6 +1629,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff @@ -1596,8 +1652,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x4000 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1609,10 +1667,12 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1628,8 +1688,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x4000 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1640,14 +1702,16 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 ; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1663,8 +1727,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 0x100 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1674,12 +1740,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 @@ -1697,8 +1764,10 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 0x100 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1714,10 +1783,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1732,8 +1803,10 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1744,10 +1817,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1762,8 +1837,10 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1773,10 +1850,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 @@ -1797,7 +1876,8 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,10 +1886,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 @@ -1823,13 +1905,15 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1844,9 +1928,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s13, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s14, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 @@ -1911,10 +1997,12 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x3000 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s13 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1924,9 +2012,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s13, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s14, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6 @@ -1986,14 +2076,16 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: ; %bb.8: ; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 ; GFX9-GISEL-NEXT: s_lshl_b32 s5, s8, 6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x3000 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s14 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s13 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2003,9 +2095,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s8, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 @@ -2079,9 +2173,10 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 0xc0 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: @@ -2089,9 +2184,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s7, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s8, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6 @@ -2154,16 +2251,18 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 ; GFX11-GISEL-NEXT: ; %bb.8: -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s7 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 0xc0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s8 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s7 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 @@ -2189,9 +2288,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2243,6 +2344,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x2000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -2253,9 +2356,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s11, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s12, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2307,6 +2412,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x2000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s12 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s11 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -2317,9 +2424,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80 ; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2376,7 +2485,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 0x80 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2386,9 +2496,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s5, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80 ; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2446,6 +2558,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2 ; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 0x80 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s6 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2472,10 +2586,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -2490,8 +2606,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2502,10 +2620,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -2520,8 +2640,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2531,10 +2653,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 @@ -2555,7 +2679,8 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2564,10 +2689,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 @@ -2581,13 +2708,15 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i16 %n, align 2, addrspace(5) @@ -2601,10 +2730,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -2619,8 +2750,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a +; GFX9-SDAG-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2630,10 +2763,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -2648,8 +2783,10 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: s_add_i32 s32, s34, 0x400 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2659,17 +2796,18 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 @@ -2681,7 +2819,8 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_add_i32 s32, s34, 16 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2690,28 +2829,31 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 ; GFX11-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_add_i32 s32, s34, 16 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i64 %n, align 2, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 2bd60e869f843..da130bc403035 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -219,8 +219,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: s_mov_b32 s8, s34 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB2_3 @@ -244,8 +246,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-NEXT: .LBB2_3: ; %bb.2 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_add_i32 s32, s34, 0x400 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s34, s8 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -254,8 +258,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR: ; %bb.0: ; %entry ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 +; FLATSCR-NEXT: s_mov_b32 s4, s34 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 @@ -278,8 +284,10 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR-NEXT: .LBB2_3: ; %bb.2 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s34, 16 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s34, s4 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -316,8 +324,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s8, s34 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB3_2 @@ -339,8 +349,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: .LBB3_2: ; %bb.1 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_add_i32 s32, s34, 0x2000 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s34, s8 ; MUBUF-NEXT: s_addk_i32 s32, 0xe000 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -350,8 +362,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s4, s34 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 @@ -371,8 +385,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: .LBB3_2: ; %bb.1 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s34, 0x80 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s34, s4 ; FLATSCR-NEXT: s_addk_i32 s32, 0xff80 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 538ce15979de8..eb67d92fa34ef 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -68,13 +68,13 @@ ; CHECK-NEXT: dynamic_stack: ; CHECK-NEXT: .backend_stack_size: 0x10 ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .sgpr_count: 0x23 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 ; CHECK-NEXT: .vgpr_count: 0x2 ; CHECK-NEXT: dynamic_stack_loop: ; CHECK-NEXT: .backend_stack_size: 0x10 ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .sgpr_count: 0x23 ; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 ; CHECK-NEXT: .vgpr_count: 0x3 ; CHECK-NEXT: multiple_stack: