diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 2e2523312840a..060db477a59f8 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1256,6 +1256,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, Register FramePtrReg = FuncInfo->getFrameOffsetReg(); bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); + if (RoundedSize != 0) { + if (TRI.hasBasePointer(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(TRI.getBaseRegister()) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameDestroy); + } + } + Register FramePtrRegScratchCopy; Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); @@ -1280,14 +1292,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, FramePtrRegScratchCopy); } - if (RoundedSize != 0 && hasFP(MF)) { - auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(-static_cast(RoundedSize * getScratchScaleFactor(ST))) - .setMIFlag(MachineInstr::FrameDestroy); - Add->getOperand(3).setIsDead(); // Mark SCC as dead. - } - if (FPSaved) { // Insert the copy to restore FP. Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 11121e6058770..6fc57dec6a826 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -525,8 +525,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment, we can't reference off of the // stack pointer, so we reserve a base pointer. - const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getNumFixedObjects() && shouldRealignStack(MF); + return shouldRealignStack(MF); } Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index 604caf572b0fe..c477732e5cd59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 974ce492daea8..410d3b1bb7062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -247,11 +247,11 @@ define void @func_caller_stack() { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -286,11 +286,11 @@ define void @func_caller_stack() { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index ae055ea041297..6b767d9e754be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -80,13 +80,13 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -103,7 +103,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -112,7 +111,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4: @@ -127,7 +127,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -136,9 +135,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) @@ -221,13 +221,13 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -244,7 +244,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -253,7 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16: @@ -268,7 +268,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -277,9 +276,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) @@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s7, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x1000 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -373,7 +375,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s5, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xf000 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -382,8 +385,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0 -; GFX10-NEXT: s_addk_i32 s32, 0x800 +; GFX10-NEXT: s_mov_b32 s7, s34 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX10-NEXT: s_mov_b32 s34, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x800 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 @@ -401,7 +406,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s5, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xf800 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32: @@ -409,8 +415,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 31 -; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_mov_b32 s3, s34 ; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_mov_b32 s34, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 64 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 @@ -429,8 +437,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-NEXT: s_add_u32 s32, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_addk_i32 s32, 0xffc0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv %alloca = alloca i32, i32 %n, align 32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 009beeb395100..767232a01c7e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) { ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 69abef02d3d92..34cf6905fe75b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -216,8 +216,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 +; GCN-NEXT: s_mov_b32 s8, s34 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x2000 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 @@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xe000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s8 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index e53653408feb4..194a23fa0d4a9 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 +; FIXEDABI-NEXT: s_mov_b32 s32, s33 ; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] -; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 ; FIXEDABI-NEXT: s_mov_b32 s33, s4 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 25b6b7be1f3b5..ab2363860af9d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -193,11 +193,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -329,11 +329,11 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -477,11 +477,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -614,11 +614,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v42, 2 ; CHECK-NEXT: v_readlane_b32 s31, v42, 1 ; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -761,11 +761,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ec469b3020cce..0382cc72a36ae 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3798,10 +3798,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -3829,10 +3829,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3858,10 +3858,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3887,10 +3887,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3917,11 +3917,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3947,10 +3947,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3990,10 +3990,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4026,10 +4026,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4055,10 +4055,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4084,10 +4084,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4114,11 +4114,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4144,10 +4144,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4189,10 +4189,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4227,10 +4227,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4259,10 +4259,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4290,10 +4290,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4322,11 +4322,11 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4354,10 +4354,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4407,10 +4407,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: v_readlane_b32 s30, v8, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4453,10 +4453,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v6, 1 ; GFX7-NEXT: v_readlane_b32 s30, v6, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4485,10 +4485,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4516,10 +4516,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4548,11 +4548,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4578,10 +4578,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4651,10 +4651,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: v_readlane_b32 s30, v16, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4717,10 +4717,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v10, 1 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4755,10 +4755,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v6, 1 ; GFX8-NEXT: v_readlane_b32 s30, v6, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4790,10 +4790,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4826,11 +4826,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4856,10 +4856,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4969,10 +4969,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: v_readlane_b32 s30, v20, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -5075,10 +5075,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v18, 1 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5125,10 +5125,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v10, 1 ; GFX8-NEXT: v_readlane_b32 s30, v10, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5168,10 +5168,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5212,11 +5212,11 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5244,10 +5244,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 ; GFX11-NEXT: v_readlane_b32 s30, v9, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index cdfaed0a203e9..9912ce3604a49 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3099,8 +3099,11 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1800 -; SDAG-NEXT: s_addk_i32 s32, 0xe800 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3124,10 +3127,13 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1000 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 -; SDAG-NEXT: s_addk_i32 s32, 0xf000 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index 93a4469c7718e..704b68aa296a9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -51,11 +51,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -79,11 +79,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -111,11 +111,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -139,11 +139,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -171,11 +171,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -199,11 +199,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -260,11 +260,11 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -293,11 +293,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -321,11 +321,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -355,11 +355,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -418,11 +418,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -446,11 +446,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -485,11 +485,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -513,11 +513,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -545,11 +545,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -573,11 +573,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -605,11 +605,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -633,11 +633,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -665,11 +665,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -693,11 +693,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -726,11 +726,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -754,11 +754,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -786,11 +786,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -814,11 +814,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -847,11 +847,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -875,11 +875,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -908,11 +908,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -936,11 +936,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -969,11 +969,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -997,11 +997,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1030,11 +1030,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1058,11 +1058,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1091,11 +1091,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1119,11 +1119,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1151,11 +1151,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1179,11 +1179,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1214,11 +1214,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1242,11 +1242,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1275,11 +1275,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1303,11 +1303,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1339,11 +1339,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1367,11 +1367,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1411,11 +1411,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_swappc_b64 s[30:31], vcc ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1439,11 +1439,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1486,11 +1486,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1514,11 +1514,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 26ab0f3ce6355..35d00390067d8 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5873,11 +5873,11 @@ define void @stack_12xv3i32() #0 { ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] -; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5941,11 +5941,11 @@ define void @stack_12xv3i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] -; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] @@ -6009,11 +6009,11 @@ define void @stack_12xv3i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6059,11 +6059,11 @@ define void @stack_12xv3i32() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6127,11 +6127,11 @@ define void @stack_12xv3i32() #0 { ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] -; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_setpc_b64 s[30:31] @@ -6212,11 +6212,11 @@ define void @stack_12xv3f32() #0 { ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] -; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6280,11 +6280,11 @@ define void @stack_12xv3f32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] -; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] @@ -6348,11 +6348,11 @@ define void @stack_12xv3f32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6402,11 +6402,11 @@ define void @stack_12xv3f32() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6470,11 +6470,11 @@ define void @stack_12xv3f32() #0 { ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] -; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_setpc_b64 s[30:31] @@ -6563,11 +6563,11 @@ define void @stack_8xv5i32() #0 { ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] -; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6639,11 +6639,11 @@ define void @stack_8xv5i32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] -; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] @@ -6715,11 +6715,11 @@ define void @stack_8xv5i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6770,11 +6770,11 @@ define void @stack_8xv5i32() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6846,11 +6846,11 @@ define void @stack_8xv5i32() #0 { ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] -; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_setpc_b64 s[30:31] @@ -6935,11 +6935,11 @@ define void @stack_8xv5f32() #0 { ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] -; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7011,11 +7011,11 @@ define void @stack_8xv5f32() #0 { ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] -; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] @@ -7087,11 +7087,11 @@ define void @stack_8xv5f32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7145,11 +7145,11 @@ define void @stack_8xv5f32() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7221,11 +7221,11 @@ define void @stack_8xv5f32() #0 { ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] -; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index e926a3c728cbd..6fb071dd42d2f 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -58,8 +58,8 @@ define void @callee_with_stack() #0 { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}} ; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xfe00 -; FLATSCR-NEXT: s_add_i32 s32, s32, -8 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { @@ -106,13 +106,13 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 { ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] +; MUBUF: s_mov_b32 s32, s33{{$}} +; FLATSCR: s_mov_b32 s32, s33{{$}} ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_addk_i32 s32, 0xfc00{{$}} -; FLATSCR: s_add_i32 s32, s32, -16{{$}} ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -149,13 +149,13 @@ define void @callee_with_stack_and_call() #0 { ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 +; MUBUF: s_mov_b32 s32, s33 +; FLATSCR: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_addk_i32 s32, 0xfc00 -; FLATSCR: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -253,10 +253,10 @@ define void @spill_only_csr_sgpr() { ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: s_addk_i32 s32, 0xfd00 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; FLATSCR: s_add_i32 s32, s32, 12 -; FLATSCR-NEXT: s_add_i32 s32, s32, -12 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -282,16 +282,14 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 +; MUBUF: s_mov_b32 s32, s33 +; FLATSCR: s_mov_b32 s32, s33 + ; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_addk_i32 s32, 0xfc00 -; FLATSCR: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -333,12 +331,12 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -366,16 +364,22 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; MUBUF-NEXT: s_mov_b32 s5, s34 +; FLATSCR-NEXT: s_mov_b32 s1, s34 +; MUBUF-NEXT: s_mov_b32 s34, s32 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 ; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}} ; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}} -; FLATSCR-NEXT: s_add_i32 s1, s33, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s1 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe80000 -; FLATSCR-NEXT: s_addk_i32 s32, 0xa000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s5 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s1 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { @@ -399,16 +403,13 @@ define void @realign_stack_no_fp_elim() #1 { ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: ;;#ASMSTART -; MUBUF: s_addk_i32 s32, 0x300 -; FLATSCR: s_add_i32 s32, s32, 12 ; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1 ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 +;GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_addk_i32 s32, 0xfd00 -; FLATSCR: s_add_i32 s32, s32, -12 ; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -435,19 +436,19 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN: v_mov_b32_e32 -; MUBUF-DAG: buffer_store_dword -; FLATSCR-DAG: scratch_store_dword ; MUBUF: s_addk_i32 s32, 0x300{{$}} ; FLATSCR: s_add_i32 s32, s32, 12{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART +; GCN: s_mov_b32 s32, s33 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_addk_i32 s32, 0xfd00{{$}} -; FLATSCR: s_add_i32 s32, s32, -12{{$}} ; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -495,8 +496,6 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}} -; FLATSCR: s_addk_i32 s32, 0xeff4{{$}} ; GCN-NEXT: s_mov_b32 s33, vcc_lo ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -538,8 +537,6 @@ define internal void @local_empty_func() #0 { ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} ; GCN: s_swappc_b64 -; MUBUF: s_addk_i32 s32, 0xfc00 -; FLATSCR: s_add_i32 s32, s32, -16 ; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index b52e7918b27ab..f85cea1ba03fb 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -428,8 +428,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; GCN: s_swappc_b64 +; GCN: s_mov_b32 s32, s33 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN: s_addk_i32 s32, 0xfc00{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 9792c9dabac2f..2399112e3fefb 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -441,8 +441,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; GCN: s_swappc_b64 +; GCN: s_mov_b32 s32, s33 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN: s_addk_i32 s32, 0xfc00{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 5e6152661aeec..9bef0b7d76ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -42,11 +42,11 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -78,11 +78,11 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -114,11 +114,11 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -151,11 +151,11 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: v_mov_b32_e32 v1, v4 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 2cd3916165fe7..8b02bdbb70b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -91,11 +91,11 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: v_readlane_b32 s34, v41, 2 ; CHECK-NEXT: v_readlane_b32 s31, v41, 1 ; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v41, 16 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 9acb3a42ae102..d61c4b46596c0 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -1084,7 +1084,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1113,7 +1113,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1140,12 +1140,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform: @@ -1171,11 +1170,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, addrspace(5) store volatile i32 123, ptr addrspace(5) %alloca @@ -1189,10 +1189,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1210,7 +1212,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1220,10 +1223,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1241,7 +1246,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1251,17 +1257,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 @@ -1274,8 +1281,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1284,17 +1291,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 ; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 @@ -1307,7 +1315,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 128, addrspace(5) store volatile i32 10, ptr addrspace(5) %alloca @@ -1340,7 +1349,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1369,7 +1378,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1396,12 +1405,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: @@ -1427,11 +1435,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 2, addrspace(5) store volatile i32 22, ptr addrspace(5) %alloca @@ -1465,7 +1474,7 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1495,7 +1504,7 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1524,12 +1533,11 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent: @@ -1557,11 +1565,12 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, addrspace(5) @@ -1575,6 +1584,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff @@ -1598,7 +1609,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1609,10 +1621,12 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] @@ -1630,7 +1644,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1640,14 +1655,16 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 ; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -1665,7 +1682,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: @@ -1674,12 +1692,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 @@ -1699,7 +1718,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 128, addrspace(5) @@ -1734,7 +1754,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1764,7 +1784,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1793,12 +1813,11 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: @@ -1826,11 +1845,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 2, addrspace(5) @@ -1844,9 +1864,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s13, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s14, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 @@ -1915,7 +1937,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s13 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1924,9 +1947,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s13, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s14, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6 @@ -1994,7 +2019,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s14 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s13 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2003,9 +2029,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s8, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 @@ -2079,9 +2107,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: @@ -2089,9 +2117,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s7, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s8, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6 @@ -2162,8 +2192,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s8 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s7 -; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 @@ -2189,9 +2220,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2243,7 +2276,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2253,9 +2287,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s11, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 +; GFX9-GISEL-NEXT: s_mov_b32 s12, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2307,7 +2343,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s12 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s11 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2317,9 +2354,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80 ; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2376,8 +2415,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2386,9 +2425,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s5, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s6, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80 ; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2446,7 +2487,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2 ; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s6 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -2492,7 +2534,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2522,7 +2564,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2551,12 +2593,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: @@ -2584,11 +2625,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i16 %n, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca @@ -2621,7 +2663,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2650,7 +2692,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2677,12 +2719,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16 +; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: @@ -2708,11 +2749,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16 +; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i64 %n, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 831e246426ba7..b5a9f02711016 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -2052,112 +2052,136 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2180,115 +2204,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2311,112 +2359,136 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2439,115 +2511,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir index de198941b565e..b7a5cf963138f 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir @@ -1708,42 +1708,51 @@ machineFunctionInfo: body: | bb.0: ; MUBUF-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUF: liveins: $sgpr4 + ; MUBUF: liveins: $sgpr4, $sgpr5 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 ; ; MUBUFW32-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUFW32: liveins: $sgpr4 + ; MUBUFW32: liveins: $sgpr4, $sgpr5 ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262112, implicit-def $scc ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUFW32-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1048576, implicit-def dead $scc ; MUBUFW32-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr33, implicit $exec ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; MUBUFW32-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1048576, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUFW32-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; FLATSCRW64: liveins: $sgpr4 + ; FLATSCRW64: liveins: $sgpr4, $sgpr5 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; FLATSCRW64-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCRW64-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCRW64-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; FLATSCRW64-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; FLATSCRW64-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0 = V_ADD_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll index 8e0750195b3b4..c4063aecb6ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -18,8 +18,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: v_writelane_b32 v42, s16, 2 +; GCN-NEXT: v_writelane_b32 v42, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 @@ -55,11 +57,12 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 +; GCN-NEXT: v_readlane_b32 s34, v42, 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xd000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 3922b5404d778..6684262f5c976 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -28,11 +28,11 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] -; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] @@ -87,7 +87,7 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s4, v0 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 6fd2c5a1267fb..32f255df82499 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -1680,11 +1680,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1710,11 +1710,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2092,11 +2092,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2134,11 +2134,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index bb0e116cb4d32..1ad365df2e8a8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -74,10 +74,10 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_readlane_b32 s6, v40, 2 ; SDAG-NEXT: v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 +; SDAG-NEXT: s_mov_b32 s32, s33 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +151,10 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_readlane_b32 s6, v40, 2 ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 5ccbc85f46dd4..2e3ca34af4c74 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -142,11 +142,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -171,12 +171,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -201,11 +201,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -230,12 +230,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -265,11 +265,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -296,12 +296,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -327,11 +327,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -358,12 +358,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -394,11 +394,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -425,12 +425,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -456,11 +456,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -487,12 +487,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -520,11 +520,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -548,12 +548,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -577,11 +577,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -605,12 +605,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -638,11 +638,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -667,12 +667,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -697,11 +697,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -726,12 +726,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -760,11 +760,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -789,12 +789,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -819,11 +819,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -848,12 +848,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -881,11 +881,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -909,12 +909,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -938,11 +938,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -966,12 +966,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -999,11 +999,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1028,12 +1028,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1058,11 +1058,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1087,12 +1087,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1121,11 +1121,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1150,12 +1150,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1180,11 +1180,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1209,12 +1209,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1242,11 +1242,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1270,12 +1270,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1299,11 +1299,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1327,12 +1327,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1360,11 +1360,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1389,12 +1389,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1418,11 +1418,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1447,12 +1447,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1481,11 +1481,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1511,12 +1511,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1542,11 +1542,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1572,12 +1572,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1608,11 +1608,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1639,12 +1639,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1669,11 +1669,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1700,12 +1700,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1736,11 +1736,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1768,12 +1768,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1799,11 +1799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1831,12 +1831,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -1872,11 +1872,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1906,12 +1906,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1938,11 +1938,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1972,12 +1972,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2006,11 +2006,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2034,12 +2034,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2063,11 +2063,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2091,12 +2091,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2123,11 +2123,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2151,12 +2151,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2180,11 +2180,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2208,12 +2208,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2241,11 +2241,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2270,12 +2270,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2299,11 +2299,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2328,12 +2328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2362,11 +2362,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2392,12 +2392,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2422,11 +2422,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2452,12 +2452,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2488,11 +2488,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2520,12 +2520,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2551,11 +2551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2583,12 +2583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2616,11 +2616,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2645,12 +2645,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2674,11 +2674,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2703,12 +2703,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2738,11 +2738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2769,12 +2769,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2799,11 +2799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2830,12 +2830,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -2867,11 +2867,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2900,12 +2900,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2931,11 +2931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2964,12 +2964,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3001,11 +3001,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3034,12 +3034,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3068,11 +3068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3101,12 +3101,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3139,11 +3139,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3172,12 +3172,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3205,11 +3205,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3238,12 +3238,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3277,11 +3277,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3311,12 +3311,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3345,11 +3345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3379,12 +3379,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3420,11 +3420,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3456,12 +3456,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3492,11 +3492,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3528,12 +3528,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3572,11 +3572,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3611,12 +3611,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3649,11 +3649,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3688,12 +3688,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -3764,11 +3764,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3836,12 +3836,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3902,11 +3902,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3974,12 +3974,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4016,11 +4016,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4052,12 +4052,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4089,11 +4089,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4125,12 +4125,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4173,11 +4173,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4209,6 +4209,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4219,7 +4220,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4253,8 +4253,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 @@ -4263,7 +4263,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4295,6 +4294,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4305,7 +4305,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4351,11 +4350,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4390,6 +4389,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: global_store_byte v[3:4], v2, off ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4400,7 +4400,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4437,6 +4436,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off @@ -4447,7 +4447,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4482,6 +4481,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4492,7 +4492,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4539,11 +4538,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4577,9 +4576,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 @@ -4589,7 +4589,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4627,9 +4626,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4641,7 +4641,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4675,9 +4674,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -4687,7 +4687,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4739,11 +4738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4779,11 +4778,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_byte v[0:1], v4, off ; GFX10-NEXT: global_store_dword v[40:41], v2, off @@ -4794,7 +4794,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4834,9 +4833,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX11-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -4851,7 +4851,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4887,11 +4886,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v4, off ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v2, off @@ -4902,7 +4902,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4959,11 +4958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5008,8 +5007,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-NEXT: s_clause 0x1 @@ -5019,7 +5019,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5072,8 +5071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: global_store_b64 v[40:41], v[0:1], off @@ -5083,7 +5083,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5128,8 +5127,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -5139,7 +5139,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5263,11 +5262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5386,12 +5385,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5537,11 +5536,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5660,12 +5659,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5696,11 +5695,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5724,12 +5723,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5753,11 +5752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5781,12 +5780,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5814,11 +5813,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5842,12 +5841,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5871,11 +5870,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5899,12 +5898,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5932,11 +5931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5960,12 +5959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5989,11 +5988,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6017,12 +6016,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6051,11 +6050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6080,12 +6079,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6109,11 +6108,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6138,12 +6137,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6171,11 +6170,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6200,12 +6199,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6230,11 +6229,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6259,12 +6258,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6291,11 +6290,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6319,12 +6318,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6348,11 +6347,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6376,12 +6375,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6410,11 +6409,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6439,12 +6438,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6469,11 +6468,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6498,12 +6497,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6530,11 +6529,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6558,12 +6557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6587,11 +6586,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6615,12 +6614,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6648,11 +6647,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6676,12 +6675,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6705,11 +6704,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6733,12 +6732,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6767,11 +6766,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6796,12 +6795,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6825,11 +6824,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6854,12 +6853,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6888,11 +6887,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6918,12 +6917,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6948,11 +6947,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6978,12 +6977,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7013,11 +7012,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7044,12 +7043,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7074,11 +7073,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7105,12 +7104,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7137,11 +7136,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7165,12 +7164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7194,11 +7193,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7222,12 +7221,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7258,11 +7257,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7289,12 +7288,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7319,11 +7318,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7350,12 +7349,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7386,11 +7385,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7418,12 +7417,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7449,11 +7448,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7481,12 +7480,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7517,11 +7516,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7550,12 +7549,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7584,11 +7583,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7617,12 +7616,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7658,11 +7657,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7693,12 +7692,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7725,11 +7724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7760,12 +7759,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7798,11 +7797,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7833,12 +7832,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -7869,11 +7868,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -7904,12 +7903,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -7949,11 +7948,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7988,12 +7987,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8028,11 +8027,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8067,12 +8066,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8115,11 +8114,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8157,12 +8156,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8199,11 +8198,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8241,12 +8240,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8284,11 +8283,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8321,12 +8320,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8358,11 +8357,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8395,12 +8394,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8432,11 +8431,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8465,12 +8464,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8499,11 +8498,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8532,12 +8531,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8570,11 +8569,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8602,12 +8601,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8635,11 +8634,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8667,12 +8666,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8712,6 +8711,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off @@ -8721,7 +8721,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8754,6 +8753,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off @@ -8764,7 +8764,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -8795,6 +8794,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc @@ -8804,7 +8804,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -8837,6 +8836,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off @@ -8847,7 +8847,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -8909,11 +8908,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -8960,12 +8959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9008,11 +9007,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9059,12 +9058,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9158,10 +9157,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9251,11 +9250,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9341,10 +9340,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: v_readlane_b32 s34, v40, 2 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9431,11 +9430,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9466,11 +9465,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9495,12 +9494,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9525,11 +9524,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9554,12 +9553,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9588,11 +9587,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9618,12 +9617,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9649,11 +9648,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9679,12 +9678,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9713,11 +9712,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9743,12 +9742,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9774,11 +9773,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9804,12 +9803,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9838,11 +9837,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9868,12 +9867,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -9899,11 +9898,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -9929,12 +9928,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -9966,11 +9965,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9999,12 +9998,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10033,11 +10032,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10066,12 +10065,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10107,11 +10106,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10144,12 +10143,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10182,11 +10181,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10219,12 +10218,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10263,11 +10262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10302,12 +10301,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10342,11 +10341,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10381,12 +10380,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10428,11 +10427,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10471,12 +10470,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10515,11 +10514,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10558,12 +10557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10614,11 +10613,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10663,12 +10662,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10713,11 +10712,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10762,12 +10761,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10798,11 +10797,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10828,12 +10827,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10859,11 +10858,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -10889,12 +10888,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -10923,11 +10922,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -10953,12 +10952,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -10984,11 +10983,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11014,12 +11013,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11051,11 +11050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11084,12 +11083,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11118,11 +11117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11151,12 +11150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11191,11 +11190,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11227,12 +11226,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11264,11 +11263,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11300,12 +11299,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11346,11 +11345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11388,12 +11387,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11431,11 +11430,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11473,12 +11472,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11510,11 +11509,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11543,12 +11542,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11577,11 +11576,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11610,12 +11609,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11653,11 +11652,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11692,12 +11691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11732,11 +11731,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11771,12 +11770,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11820,11 +11819,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11865,12 +11864,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11911,11 +11910,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11956,12 +11955,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11990,11 +11989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12020,12 +12019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12051,11 +12050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12081,12 +12080,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12118,11 +12117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12150,12 +12149,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12183,11 +12182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12215,12 +12214,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12252,11 +12251,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12284,12 +12283,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12317,11 +12316,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12349,12 +12348,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12387,11 +12386,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12420,12 +12419,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12454,11 +12453,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12487,12 +12486,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12524,11 +12523,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12557,12 +12556,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12591,11 +12590,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12624,12 +12623,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12660,11 +12659,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12692,12 +12691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12725,11 +12724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12757,12 +12756,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12795,11 +12794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12828,12 +12827,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12862,11 +12861,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12895,12 +12894,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12929,11 +12928,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12959,12 +12958,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12990,11 +12989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13020,12 +13019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13057,11 +13056,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13089,12 +13088,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13122,11 +13121,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13154,12 +13153,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13192,11 +13191,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13225,12 +13224,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13259,11 +13258,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13292,12 +13291,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13332,11 +13331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13368,12 +13367,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13405,11 +13404,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13441,12 +13440,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13484,11 +13483,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13523,12 +13522,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13563,11 +13562,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13602,12 +13601,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13642,11 +13641,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13678,12 +13677,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13715,11 +13714,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13751,12 +13750,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13795,11 +13794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13834,12 +13833,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13874,11 +13873,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13913,12 +13912,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13959,11 +13958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14001,12 +14000,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14044,11 +14043,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14086,12 +14085,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14136,11 +14135,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14182,12 +14181,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14229,11 +14228,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14275,12 +14274,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14332,11 +14331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14383,12 +14382,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14435,11 +14434,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14486,12 +14485,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14552,11 +14551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 18 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14614,12 +14613,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 18 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14677,11 +14676,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 18 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14739,12 +14738,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14851,11 +14850,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14958,12 +14957,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -15061,11 +15060,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -15165,12 +15164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -15282,11 +15281,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -15394,12 +15393,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -15501,11 +15500,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -15611,12 +15610,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -15651,11 +15650,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -15685,12 +15684,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -15715,11 +15714,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -15745,12 +15744,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -15817,11 +15816,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -15884,12 +15883,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -15931,11 +15930,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -15995,12 +15994,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16087,11 +16086,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16162,12 +16161,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -16213,11 +16212,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -16283,12 +16282,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16371,11 +16370,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16446,12 +16445,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -16503,11 +16502,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -16573,12 +16572,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16613,11 +16612,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16640,12 +16639,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -16668,11 +16667,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -16695,12 +16694,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16727,11 +16726,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16754,12 +16753,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -16782,11 +16781,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -16809,12 +16808,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16841,11 +16840,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16868,12 +16867,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -16896,11 +16895,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -16923,12 +16922,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -16955,11 +16954,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -16982,12 +16981,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17010,11 +17009,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17037,12 +17036,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17069,11 +17068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17096,12 +17095,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17124,11 +17123,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17151,12 +17150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17183,11 +17182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17210,12 +17209,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17238,11 +17237,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17265,12 +17264,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17297,11 +17296,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17324,12 +17323,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17352,11 +17351,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17379,12 +17378,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17411,11 +17410,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17438,12 +17437,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17466,11 +17465,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17493,12 +17492,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17525,11 +17524,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17552,12 +17551,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17580,11 +17579,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17607,12 +17606,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17639,11 +17638,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17666,12 +17665,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17694,11 +17693,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17721,12 +17720,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17753,11 +17752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17780,12 +17779,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17808,11 +17807,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17835,12 +17834,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17867,11 +17866,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -17894,12 +17893,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -17922,11 +17921,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -17949,12 +17948,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -17981,11 +17980,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -18008,12 +18007,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -18036,11 +18035,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -18063,12 +18062,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -18095,11 +18094,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -18122,12 +18121,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -18150,11 +18149,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -18177,12 +18176,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index a14e3d5673f82..4afc2fc972a28 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -30,11 +30,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -64,12 +64,12 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -99,11 +99,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -227,11 +227,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -264,12 +264,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -301,11 +301,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -343,11 +343,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -380,12 +380,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -418,11 +418,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -461,11 +461,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -498,12 +498,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -536,11 +536,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -578,11 +578,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -615,12 +615,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -653,11 +653,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -693,11 +693,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -728,12 +728,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -763,11 +763,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -911,11 +911,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -938,12 +938,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -966,11 +966,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -996,11 +996,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1023,12 +1023,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1051,11 +1051,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1090,11 +1090,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1126,12 +1126,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1163,11 +1163,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1213,11 +1213,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: v_readlane_b32 s31, v41, 2 ; GFX9-NEXT: v_readlane_b32 s30, v41, 1 ; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1258,12 +1258,12 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: v_readlane_b32 s31, v41, 2 ; GFX10-NEXT: v_readlane_b32 s30, v41, 1 ; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1303,11 +1303,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: v_readlane_b32 s31, v41, 2 ; GFX11-NEXT: v_readlane_b32 s30, v41, 1 ; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c3ab9c23d1950..6384fdba7a45a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -36,10 +36,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -61,11 +61,11 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -87,10 +87,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -132,10 +132,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -157,11 +157,11 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -183,10 +183,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -228,10 +228,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -253,11 +253,11 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -279,10 +279,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -332,10 +332,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -357,11 +357,11 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -383,10 +383,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -746,10 +746,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v100, 1 ; GFX9-NEXT: v_readlane_b32 s30, v100, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xdc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -836,11 +836,11 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; GFX10-NEXT: v_readlane_b32 s31, v100, 1 ; GFX10-NEXT: v_readlane_b32 s30, v100, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xee00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -927,10 +927,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:124 ; GFX11-NEXT: v_readlane_b32 s31, v100, 1 ; GFX11-NEXT: v_readlane_b32 s30, v100, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xff70 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2130,61 +2130,67 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX9-LABEL: call_512xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x60000 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_512xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_512xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 @@ -2195,17 +2201,20 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: s_mov_b32 s1, return_512xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_512xi32@abs32@lo +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0x1800 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 offset:2048 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xe800 -; GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -2619,12 +2628,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-LABEL: call_72xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill @@ -2685,8 +2696,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2720,7 +2731,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -2859,7 +2870,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -2877,25 +2888,28 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_72xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill @@ -2912,8 +2926,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2956,11 +2968,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2987,10 +3001,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v29, 0 ; GFX10-NEXT: v_mov_b32_e32 v30, 0 ; GFX10-NEXT: v_mov_b32_e32 v31, 0 -; GFX10-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 @@ -3133,7 +3147,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0xe ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 @@ -3152,19 +3166,20 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; GFX10-NEXT: v_readlane_b32 s31, v63, 1 ; GFX10-NEXT: v_readlane_b32 s30, v63, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 @@ -3178,6 +3193,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 ; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 @@ -3347,11 +3364,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 ; GFX11-NEXT: v_readlane_b32 s31, v60, 1 ; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll index 334e6e2b617e0..d8df20eb69452 100644 --- a/llvm/test/CodeGen/AMDGPU/global-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll @@ -37,11 +37,11 @@ define void @bar() { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index da8aa54469835..55da485b91f67 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -193,11 +193,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -277,11 +277,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -368,11 +368,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -453,11 +453,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -543,11 +543,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -629,11 +629,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -729,11 +729,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 20 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -824,11 +824,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 20 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -930,10 +930,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1024,10 +1024,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1126,10 +1126,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1224,10 +1224,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s31, v41, 1 ; GISEL-NEXT: v_readlane_b32 s30, v41, 0 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1328,10 +1328,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1424,10 +1424,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1522,10 +1522,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1615,10 +1615,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 5c5a769178dd9..ea3d57d127151 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -36,10 +36,10 @@ define void @f0() { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v4, 1 ; GFX11-NEXT: v_readlane_b32 s30, v4, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index eef51acc4e12e..1f518386c63d5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -41,11 +41,11 @@ define fastcc i32 @foo() { ; CHECK-NEXT: bb.2.DummyReturnBlock: ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 + ; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADDK_I32 $sgpr32, -512, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4 ; CHECK-NEXT: S_WAITCNT 16240 ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit undef $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 29fbb0bb1c6c9..0edc7cb01887b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -112,10 +112,12 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 +; MUBUF-NEXT: s_mov_b32 s6, s34 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 ; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -141,7 +143,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s6 ; MUBUF-NEXT: s_mov_b32 s33, s5 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc @@ -155,6 +158,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_mov_b32 s2, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_mov_b32 s3, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x2000 @@ -179,7 +184,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0x8000 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s3 ; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index b77c3a9bb532b..8e436b327cda1 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -439,6 +439,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 @@ -455,7 +456,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -473,6 +473,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 @@ -488,7 +489,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 ; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -513,7 +513,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -523,7 +523,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 ; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -558,11 +557,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -593,11 +592,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v1, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -626,11 +625,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v1, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -660,11 +659,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v1, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4044 ; GFX940-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1027,6 +1026,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART @@ -1038,7 +1038,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1056,6 +1055,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART @@ -1066,7 +1066,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1084,9 +1083,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addc_u32 s0, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -1096,7 +1096,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1122,11 +1121,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1152,11 +1151,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1180,11 +1179,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1209,11 +1208,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1236,6 +1235,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 @@ -1246,7 +1246,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1263,6 +1262,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 @@ -1272,7 +1272,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1289,7 +1288,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addk_i32 s32, 0x4080 ; GFX11-NEXT: s_add_i32 s1, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 @@ -1299,7 +1298,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1318,18 +1316,17 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 +; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1352,11 +1349,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1378,11 +1375,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[6:7] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1404,11 +1401,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX940-NEXT: ; use s59 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; GFX940-NEXT: s_add_i32 s1, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 299bbdac60091..4bc7711f2f839 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -229,11 +229,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: v_readlane_b32 s34, v43, 2 ; GFX9-NEXT: v_readlane_b32 s31, v43, 1 ; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 0e750d879ac94..2d853212166e9 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -12,7 +12,7 @@ define hidden fastcc void @callee_has_fp() #1 { ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffe00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -50,10 +50,10 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -192,10 +192,10 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -226,10 +226,10 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 64a94a5ee0e70..593f40fd1b25e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -24,12 +24,12 @@ declare void @external_void_func_i32(i32) #0 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 +; GCN: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,6 @@ define void @test_func_call_external_void_func_i32_imm() #0 { ; GCN-DAG: s_addk_i32 s32, 0x1400{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; GCN: s_swappc_b64 -; GCN: s_addk_i32 s32, 0xec00{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 9999cb9173b5d..25b7b043fc6b6 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -38,12 +38,11 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 2bd60e869f843..fb14f1844427e 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -246,7 +246,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -280,7 +280,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -316,8 +316,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s8, s34 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB3_2 @@ -341,7 +343,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xe000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s8 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -350,8 +353,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s4, s34 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 @@ -373,7 +378,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0xff80 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s4 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index a204866170759..ba6524caf668d 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -37,6 +37,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -50,12 +52,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -86,11 +89,17 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $sgpr29, $vgpr1 + ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -100,11 +109,16 @@ body: | ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -135,21 +149,28 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr29 - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr29 + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc + ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -179,11 +200,13 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -193,7 +216,8 @@ body: | ; CHECK-NEXT: $vcc_lo = S_MOV_B32 16384 ; CHECK-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index a4104737d974f..162d12f651d4a 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -33,6 +33,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -40,12 +42,13 @@ body: | ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; @@ -60,6 +63,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc @@ -68,12 +73,13 @@ body: | ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index 45e95d133e1bb..a4f936a4d705c 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -32,6 +32,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -40,12 +42,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir index 9462d01ba758d..63a4759d8e740 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -34,6 +34,8 @@ body: | ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192 @@ -42,12 +44,13 @@ body: | ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384 ; GFX8-NEXT: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX8-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX8-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX8-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX8-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX8-NEXT: $sgpr33 = COPY $sgpr4 ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -62,18 +65,21 @@ body: | ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX9-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -88,17 +94,20 @@ body: | ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr4, $vgpr1, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 1126db9cae93f..20e5af1b87f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -340,6 +340,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 +; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 @@ -366,7 +367,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[6:7] -; GFX906-NEXT: s_addk_i32 s32, 0xd800 ; GFX906-NEXT: s_mov_b32 s33, s4 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -752,8 +752,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b32 s32, s33 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s4, v0 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload @@ -765,7 +766,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_addk_i32 s32, 0xd400 ; GFX908-NEXT: s_mov_b32 s33, s4 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 648f4fc64f9d0..64a8f5484673f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -20,8 +20,8 @@ ; CHECK-LABEL: {{^}}call_72xi32: -; GFX11-PAL: NumSgprs: 35 -; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumSgprs: 37 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 37 ; GFX11-PAL: NumVgprs: 64 ; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 ; GFX11-PAL: ScratchSize: 2780 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 94cbe568a6a44..925984b15367d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -198,6 +198,7 @@ body: | ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33 ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) @@ -205,7 +206,6 @@ body: | ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 - ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index ff2202f1e177b..4a01962aa4084 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -265,10 +265,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -557,10 +557,10 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1788,7 +1788,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 5536a09538e6e..0676bc79a46f5 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -226,11 +226,11 @@ entry: ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 70bd63d31d5d7..fed60eecc8a8b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -42,7 +42,7 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd800 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { @@ -63,7 +63,7 @@ define void @needs_align16_stack_align4(i32 %idx) #2 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd000 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { @@ -79,7 +79,7 @@ define void @needs_align32(i32 %idx) #0 { ; GCN: s_addk_i32 s32, 0xd00{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xf300 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { @@ -127,10 +127,12 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x4000 ; GCN-NOT: s33 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_addk_i32 s32, 0xc000 +; GCN: s_mov_b32 s32, s34 ; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) @@ -175,12 +177,12 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) @@ -209,8 +211,8 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen ; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 +; GCN: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 s[30:31] begin: diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index da99052ba69ba..8f16fcf6d0890 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1282,16 +1282,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s19 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1317,16 +1317,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s19 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1433,11 +1433,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-O0-NEXT: s_mov_b32 s32, s33 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-O0-NEXT: s_mov_b32 s33, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] @@ -1544,11 +1544,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-O0-NEXT: s_mov_b32 s32, s33 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 ; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1655,11 +1655,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 3bf7fec81c041..ebd4bc881f2af 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -196,11 +196,11 @@ define void @outgoing_f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -268,9 +268,10 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: flat_store_short v[40:41], v0 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload @@ -278,7 +279,6 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -315,8 +315,9 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -326,7 +327,6 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -470,11 +470,11 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -528,11 +528,11 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index cd6cb4d1e9fe4..242b5e9aeaf42 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -28,11 +28,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -64,11 +64,11 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index b678e3e87202a..d9df80ce6c1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -110,11 +110,11 @@ define hidden void @widget() { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v41, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -458,11 +458,11 @@ define hidden void @blam() { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v45, 26 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir index 33fb595157256..8a0bf26f81d22 100644 --- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -47,6 +47,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -67,12 +69,13 @@ body: | ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: liveins: $vgpr2 ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -11010048, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0 ; @@ -89,6 +92,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -118,12 +123,13 @@ body: | ; FLATSCR-NEXT: bb.2: ; FLATSCR-NEXT: liveins: $vgpr2 ; FLATSCR-NEXT: {{ $}} + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -172032, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 2b96e10fd3cc3..6b9476af7a493 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -54,11 +54,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -111,12 +111,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -165,11 +165,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -238,11 +238,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v45, 1 ; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -288,12 +288,12 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v45, 1 ; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -337,11 +337,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v45, 1 ; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4e17be1ebb312..0307472fce732 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2866,12 +2866,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1032-NEXT: s_mov_b32 s32, s33 ; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s5 -; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 ; GFX1032-NEXT: s_mov_b32 s33, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: s_setpc_b64 s[30:31] @@ -2897,12 +2897,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1064-NEXT: s_mov_b32 s32, s33 ; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 ; GFX1064-NEXT: s_mov_b32 s33, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index d1ee82e74b3de..cb3a0e1ebb553 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -47,6 +47,7 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: s_mov_b32 s32, s33 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 @@ -56,7 +57,6 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 ; GFX90A-NEXT: s_mov_b32 s33, s4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 4837efe6606b8..766386d84a616 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -53,6 +53,7 @@ define void @test() #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 @@ -61,7 +62,6 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -113,6 +113,7 @@ define void @test() #0 { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-O0-NEXT: s_mov_b32 s32, s33 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 @@ -121,7 +122,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 exec, -1 ; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GCN-O0-NEXT: s_mov_b32 s33, s4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 1089093ea691c..2e59a36adb7e4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -390,12 +390,12 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -428,12 +428,12 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -636,6 +636,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -650,7 +651,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 ; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -698,6 +698,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -707,7 +708,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected index d1500e002d7e9..429bee4195fa9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -101,7 +101,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -138,7 +138,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected index deadc4adb02c5..842fd8836da7e 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -42,7 +42,7 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -115,7 +115,7 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31]