diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 6fb071dd42d2f..3241a76d46a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,123 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s -; GCN-LABEL: {{^}}callee_no_stack: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack() #0 { +; GCN-LABEL: callee_no_stack: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_all() #1 { +; MUBUF-LABEL: callee_no_stack_no_fp_elim_all: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_no_stack_no_fp_elim_all: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_nonleaf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_nonleaf() #2 { +; GCN-LABEL: callee_no_stack_no_fp_elim_nonleaf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_with_stack: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { +; MUBUF-LABEL: callee_with_stack: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } ; Can use free call clobbered register to preserve original FP value. - -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: s_addk_i32 s32, 0x200 -; FLATSCR-NEXT: s_add_i32 s32, s32, 8 -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_all: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_all: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_non_leaf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}} -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_non_leaf: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_non_leaf: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}callee_with_stack_and_call: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2 -; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} -; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, - -; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} -; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}} - -; GCN: s_swappc_b64 - -; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] -; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] - -; MUBUF: s_mov_b32 s32, s33{{$}} -; FLATSCR: s_mov_b32 s32, s33{{$}} -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) - -; GCN-NEXT: s_setpc_b64 s[30:31] define void @callee_with_stack_and_call() #0 { +; MUBUF-LABEL: callee_with_stack_and_call: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s16, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[18:19], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[18:19] +; MUBUF-NEXT: v_writelane_b32 v40, s16, 2 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_and_call: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void @external_void_func_void() @@ -130,36 +184,60 @@ define void @callee_with_stack_and_call() #0 { ; There is stack usage only because of the need to evict a VGPR for ; spilling CSR SGPRs. -; GCN-LABEL: {{^}}callee_no_stack_with_call: -; GCN: s_waitcnt -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF-DAG: s_addk_i32 s32, 0x400 -; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] - -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN: s_swappc_b64 - -; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 -; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 - -; MUBUF: s_mov_b32 s32, s33 -; FLATSCR: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]] -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @callee_no_stack_with_call() #0 { +; MUBUF-LABEL: callee_no_stack_with_call: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s16, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[18:19], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[18:19] +; MUBUF-NEXT: v_writelane_b32 v40, s16, 2 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_no_stack_with_call: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() ret void } @@ -168,26 +246,306 @@ declare hidden void @external_void_func_void() #0 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and ; restored. No FP is required. -; -; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s -; GCN: v_writelane_b32 [[CSR_VGPR]], s - -; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] -; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] - -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { +; MUBUF-LABEL: callee_func_sgpr_spill_no_calls: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s36, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s37, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s38, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s39, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s41, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s42, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s43, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s44, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s45, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s46, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s47, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s56, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s57, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s58, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s59, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s60, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s61, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s62, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s63, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 30 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 31 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 32 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 33 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 34 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 35 +; MUBUF-NEXT: v_writelane_b32 v40, s72, 36 +; MUBUF-NEXT: v_writelane_b32 v40, s73, 37 +; MUBUF-NEXT: v_writelane_b32 v40, s74, 38 +; MUBUF-NEXT: v_writelane_b32 v40, s75, 39 +; MUBUF-NEXT: v_writelane_b32 v40, s76, 40 +; MUBUF-NEXT: v_writelane_b32 v40, s77, 41 +; MUBUF-NEXT: v_writelane_b32 v40, s78, 42 +; MUBUF-NEXT: v_writelane_b32 v40, s79, 43 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 44 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 45 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 46 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 47 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[68:83] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[52:67] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[36:51] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[4:19] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[20:27] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[28:29] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[68:83] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[52:67] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[36:51] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[20:27] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[28:29] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[4:19] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s83, v40, 47 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 46 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 45 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 44 +; MUBUF-NEXT: v_readlane_b32 s79, v40, 43 +; MUBUF-NEXT: v_readlane_b32 s78, v40, 42 +; MUBUF-NEXT: v_readlane_b32 s77, v40, 41 +; MUBUF-NEXT: v_readlane_b32 s76, v40, 40 +; MUBUF-NEXT: v_readlane_b32 s75, v40, 39 +; MUBUF-NEXT: v_readlane_b32 s74, v40, 38 +; MUBUF-NEXT: v_readlane_b32 s73, v40, 37 +; MUBUF-NEXT: v_readlane_b32 s72, v40, 36 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 35 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 34 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 33 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 32 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 29 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s63, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s62, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s61, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s60, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s59, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s58, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s57, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s56, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s47, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s46, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s45, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s44, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s43, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s42, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s41, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s40, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s39, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s38, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s37, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s36, v40, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_func_sgpr_spill_no_calls: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s36, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s37, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s38, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s41, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s42, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s43, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s44, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s45, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s46, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s47, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s56, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s57, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s58, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s59, 27 +; FLATSCR-NEXT: v_writelane_b32 v40, s60, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s61, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s62, 30 +; FLATSCR-NEXT: v_writelane_b32 v40, s63, 31 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 32 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 33 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 34 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 35 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 36 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 37 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 38 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 39 +; FLATSCR-NEXT: v_writelane_b32 v40, s72, 40 +; FLATSCR-NEXT: v_writelane_b32 v40, s73, 41 +; FLATSCR-NEXT: v_writelane_b32 v40, s74, 42 +; FLATSCR-NEXT: v_writelane_b32 v40, s75, 43 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[52:67] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[36:51] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[16:31] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[0:15] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[68:75] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[34:35] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[52:67] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[36:51] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[16:31] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[68:75] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[34:35] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[0:15] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s75, v40, 43 +; FLATSCR-NEXT: v_readlane_b32 s74, v40, 42 +; FLATSCR-NEXT: v_readlane_b32 s73, v40, 41 +; FLATSCR-NEXT: v_readlane_b32 s72, v40, 40 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 39 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 38 +; FLATSCR-NEXT: v_readlane_b32 s69, v40, 37 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 36 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 35 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 34 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 33 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 32 +; FLATSCR-NEXT: v_readlane_b32 s63, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s62, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s61, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s60, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s59, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s58, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s57, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s56, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s47, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s46, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s45, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s44, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s43, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s42, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s41, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s38, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s37, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s36, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 @@ -212,55 +570,83 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to ; enable all lanes and restore. - -; GCN-LABEL: {{^}}spill_only_csr_sgpr: -; GCN: s_waitcnt -; GCN-NEXT: s_xor_saveexec_b64 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -; GCN-NEXT: v_writelane_b32 v0, s42, 0 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; clobber s42 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s42, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @spill_only_csr_sgpr() { +; MUBUF-LABEL: spill_only_csr_sgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s42, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber s42 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s42, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: spill_only_csr_sgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s42, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber s42 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s42, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber s42", "~{s42}"() ret void } ; TODO: Can the SP inc/deec be remvoed? -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr: -; GCN: s_waitcnt -; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 -; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; clobber v41 -; GCN-NEXT: ;;#ASMEND - -; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload -; MUBUF: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_mov_b32 s33, s4 -; FLATSCR: s_add_i32 s32, s32, 12 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s33, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_csr_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_csr_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -268,32 +654,312 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { } ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. -; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v1 -; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 -; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 -; GCN: ;;#ASMSTART - -; MUBUF: s_mov_b32 s32, s33 -; FLATSCR: s_mov_b32 s32, s33 - -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { +; MUBUF-LABEL: last_lane_vgpr_for_fp_csr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v1, s40, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s41, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s42, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s43, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s44, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s45, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s46, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s47, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s56, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s57, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s58, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s59, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s60, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s61, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s62, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s63, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 30 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 31 +; MUBUF-NEXT: v_writelane_b32 v1, s72, 32 +; MUBUF-NEXT: v_writelane_b32 v1, s73, 33 +; MUBUF-NEXT: v_writelane_b32 v1, s74, 34 +; MUBUF-NEXT: v_writelane_b32 v1, s75, 35 +; MUBUF-NEXT: v_writelane_b32 v1, s76, 36 +; MUBUF-NEXT: v_writelane_b32 v1, s77, 37 +; MUBUF-NEXT: v_writelane_b32 v1, s78, 38 +; MUBUF-NEXT: v_writelane_b32 v1, s79, 39 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 40 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 41 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 42 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 43 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 44 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 45 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 46 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 47 +; MUBUF-NEXT: v_writelane_b32 v1, s88, 48 +; MUBUF-NEXT: v_writelane_b32 v1, s89, 49 +; MUBUF-NEXT: v_writelane_b32 v1, s90, 50 +; MUBUF-NEXT: v_writelane_b32 v1, s91, 51 +; MUBUF-NEXT: v_writelane_b32 v1, s92, 52 +; MUBUF-NEXT: v_writelane_b32 v1, s93, 53 +; MUBUF-NEXT: v_writelane_b32 v1, s94, 54 +; MUBUF-NEXT: v_writelane_b32 v1, s95, 55 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 56 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 57 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 58 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 59 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 60 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v1, s101, 61 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_writelane_b32 v1, s102, 62 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 62 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 61 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 60 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 59 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 58 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 57 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 56 +; MUBUF-NEXT: v_readlane_b32 s95, v1, 55 +; MUBUF-NEXT: v_readlane_b32 s94, v1, 54 +; MUBUF-NEXT: v_readlane_b32 s93, v1, 53 +; MUBUF-NEXT: v_readlane_b32 s92, v1, 52 +; MUBUF-NEXT: v_readlane_b32 s91, v1, 51 +; MUBUF-NEXT: v_readlane_b32 s90, v1, 50 +; MUBUF-NEXT: v_readlane_b32 s89, v1, 49 +; MUBUF-NEXT: v_readlane_b32 s88, v1, 48 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 47 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 46 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 45 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 44 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 43 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 42 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 41 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 40 +; MUBUF-NEXT: v_readlane_b32 s79, v1, 39 +; MUBUF-NEXT: v_readlane_b32 s78, v1, 38 +; MUBUF-NEXT: v_readlane_b32 s77, v1, 37 +; MUBUF-NEXT: v_readlane_b32 s76, v1, 36 +; MUBUF-NEXT: v_readlane_b32 s75, v1, 35 +; MUBUF-NEXT: v_readlane_b32 s74, v1, 34 +; MUBUF-NEXT: v_readlane_b32 s73, v1, 33 +; MUBUF-NEXT: v_readlane_b32 s72, v1, 32 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s63, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s62, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s61, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s60, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s59, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s58, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s57, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s56, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s47, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s46, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s45, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s44, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s43, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s42, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s41, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s40, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: last_lane_vgpr_for_fp_csr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v1, s40, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s41, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s42, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s43, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s44, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s45, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s46, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s47, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s56, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s57, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s58, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s59, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s60, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s61, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s62, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s63, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 30 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 31 +; FLATSCR-NEXT: v_writelane_b32 v1, s72, 32 +; FLATSCR-NEXT: v_writelane_b32 v1, s73, 33 +; FLATSCR-NEXT: v_writelane_b32 v1, s74, 34 +; FLATSCR-NEXT: v_writelane_b32 v1, s75, 35 +; FLATSCR-NEXT: v_writelane_b32 v1, s76, 36 +; FLATSCR-NEXT: v_writelane_b32 v1, s77, 37 +; FLATSCR-NEXT: v_writelane_b32 v1, s78, 38 +; FLATSCR-NEXT: v_writelane_b32 v1, s79, 39 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 40 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 41 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 42 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 43 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 44 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 45 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 46 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 47 +; FLATSCR-NEXT: v_writelane_b32 v1, s88, 48 +; FLATSCR-NEXT: v_writelane_b32 v1, s89, 49 +; FLATSCR-NEXT: v_writelane_b32 v1, s90, 50 +; FLATSCR-NEXT: v_writelane_b32 v1, s91, 51 +; FLATSCR-NEXT: v_writelane_b32 v1, s92, 52 +; FLATSCR-NEXT: v_writelane_b32 v1, s93, 53 +; FLATSCR-NEXT: v_writelane_b32 v1, s94, 54 +; FLATSCR-NEXT: v_writelane_b32 v1, s95, 55 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 56 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 57 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 58 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 59 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 60 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 61 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 62 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 62 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 61 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 60 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 59 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 58 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 57 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 56 +; FLATSCR-NEXT: v_readlane_b32 s95, v1, 55 +; FLATSCR-NEXT: v_readlane_b32 s94, v1, 54 +; FLATSCR-NEXT: v_readlane_b32 s93, v1, 53 +; FLATSCR-NEXT: v_readlane_b32 s92, v1, 52 +; FLATSCR-NEXT: v_readlane_b32 s91, v1, 51 +; FLATSCR-NEXT: v_readlane_b32 s90, v1, 50 +; FLATSCR-NEXT: v_readlane_b32 s89, v1, 49 +; FLATSCR-NEXT: v_readlane_b32 s88, v1, 48 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 47 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 46 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 45 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 44 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 43 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 42 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 41 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 40 +; FLATSCR-NEXT: v_readlane_b32 s79, v1, 39 +; FLATSCR-NEXT: v_readlane_b32 s78, v1, 38 +; FLATSCR-NEXT: v_readlane_b32 s77, v1, 37 +; FLATSCR-NEXT: v_readlane_b32 s76, v1, 36 +; FLATSCR-NEXT: v_readlane_b32 s75, v1, 35 +; FLATSCR-NEXT: v_readlane_b32 s74, v1, 34 +; FLATSCR-NEXT: v_readlane_b32 s73, v1, 33 +; FLATSCR-NEXT: v_readlane_b32 s72, v1, 32 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s63, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s62, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s61, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s60, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s59, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s58, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s57, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s56, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s47, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s46, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s45, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s44, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s43, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s42, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s41, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s40, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -310,37 +976,316 @@ define void @last_lane_vgpr_for_fp_csr() #1 { } ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. -; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-61: v_writelane_b32 v1, -; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v1, -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword -; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1, -; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @no_new_vgpr_for_fp_csr() #1 { +; MUBUF-LABEL: no_new_vgpr_for_fp_csr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v1, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v1, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v1, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v1, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v1, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v1, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v1, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v1, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v1, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v1, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v1, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v1, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v1, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v1, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v1, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v1, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v1, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 61 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v1, s101, 62 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_writelane_b32 v1, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v1, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v1, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v1, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v1, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v1, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v1, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v1, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v1, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v1, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v1, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v1, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v1, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v1, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v1, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v1, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v1, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_new_vgpr_for_fp_csr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v1, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v1, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v1, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v1, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v1, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v1, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v1, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v1, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v1, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v1, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v1, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v1, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v1, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v1, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v1, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v1, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v1, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 61 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 62 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v1, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v1, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v1, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v1, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v1, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v1, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v1, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v1, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v1, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v1, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v1, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v1, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v1, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v1, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v1, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v1, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -356,64 +1301,99 @@ define void @no_new_vgpr_for_fp_csr() #1 { ret void } -; GCN-LABEL: {{^}}realign_stack_no_fp_elim: -; GCN: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 -; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff -; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 -; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 -; MUBUF-NEXT: s_mov_b32 s5, s34 -; FLATSCR-NEXT: s_mov_b32 s1, s34 -; MUBUF-NEXT: s_mov_b32 s34, s32 -; FLATSCR-NEXT: s_mov_b32 s34, s32 -; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 -; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 -; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}} -; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}} -; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s32, s34 -; MUBUF-NEXT: s_mov_b32 s34, s5 -; FLATSCR-NEXT: s_mov_b32 s32, s34 -; FLATSCR-NEXT: s_mov_b32 s34, s1 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { +; MUBUF-LABEL: realign_stack_no_fp_elim: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 +; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 +; MUBUF-NEXT: s_mov_b32 s5, s34 +; MUBUF-NEXT: s_mov_b32 s34, s32 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s5 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: realign_stack_no_fp_elim: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff +; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_mov_b32 s1, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s32 +; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s1 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 8192, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} -; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1 -; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 -;GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp() #1 { +; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -428,31 +1408,64 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { } ; Need a new CSR VGPR to satisfy the FP spill. -; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_mov_b32_e32 - -; MUBUF: s_addk_i32 s32, 0x300{{$}} -; FLATSCR: s_add_i32 s32, s32, 12{{$}} -; MUBUF-DAG: buffer_store_dword -; FLATSCR-DAG: scratch_store_dword - -; GCN: ;;#ASMSTART -; GCN: s_mov_b32 s32, s33 -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { +; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -474,32 +1487,72 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; The byval argument exceeds the MUBUF constant offset, so a scratch ; register is needed to access the CSR VGPR slot. -; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-DAG: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} -; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} -; MUBUF-DAG: buffer_store_dword -; FLATSCR-DAG: scratch_store_dword - -; GCN: ;;#ASMSTART -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #1 { +; MUBUF-LABEL: scratch_reg_needed_mubuf_offset: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s6 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 +; FLATSCR-NEXT: scratch_store_dword off, v40, s2 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_addk_i32 s32, 0x100c +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s0, s33, 0x1000 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 +; FLATSCR-NEXT: scratch_load_dword v40, off, s2 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -520,25 +1573,72 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ret void } -; GCN-LABEL: {{^}}local_empty_func: -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 define internal void @local_empty_func() #0 { +; GCN-LABEL: local_empty_func: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } ; An FP is needed, despite not needing any spills ; TODO: Ccould see callee does not use stack and omit FP. -; GCN-LABEL: {{^}}ipra_call_with_stack: -; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 -; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} -; GCN: s_swappc_b64 -; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @ipra_call_with_stack() #0 { +; MUBUF-LABEL: ipra_call_with_stack: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s18, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[16:17] +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, local_empty_func@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, local_empty_func@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, s18 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: ipra_call_with_stack: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s2, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, local_empty_func@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, local_empty_func@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, s2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void @local_empty_func() @@ -546,21 +1646,41 @@ define void @ipra_call_with_stack() #0 { } ; With no free registers, we must spill the FP to memory. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s0, s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF: s_waitcnt vmcnt(0) -; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] -; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; FLATSCR: s_mov_b32 s33, s0 -; GCN: s_setpc_b64 -; MUBUF: ScratchSize: 8 -; FLATSCR: ScratchSize: 0 define void @callee_need_to_spill_fp_to_memory() #3 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_memory: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -578,23 +1698,313 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; If we have a reserved VGPR that can be used for SGPR spills, we may still ; need to spill the FP to memory if there are no free lanes in the reserved ; VGPR. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 offset:[[OFF:[0-9]+]] -; GCN-NOT: v_writelane_b32 v40, s33 -; GCN-NOT: v_readlane_b32 s33, v40 -; GCN-NOT: v_readlane_b32 s33, v40 -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 offset:[[OFF]] -; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] -; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] -; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v39, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v39, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -619,17 +2029,312 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; the exec register is saved to s0 when saving CSR in the function prolog. ; Make sure that the FP save happens after restoring exec from the same ; register. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s33, s32 -; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_reg: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v40, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v40, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v40, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v40, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v40, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v40, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v40, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v40, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v40, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v40, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v40, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v40, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v40, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v40, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v40, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v40, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v40, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v40, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v40, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v40, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v40, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v40, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v40, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v40, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v40, s100, 61 +; MUBUF-NEXT: v_writelane_b32 v40, s101, 62 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v40, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s102, v40, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v40, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v40, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v40, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v40, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v40, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v40, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v40, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v40, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v40, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v40, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v40, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v40, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v40, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v40, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v40, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v40, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v40, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v40, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v40, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v40, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v40, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v40, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v40, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v40, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v40, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v40, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_reg: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v40, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v40, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v40, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v40, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v40, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v40, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v40, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v40, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v40, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v40, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v40, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v40, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v40, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v40, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v40, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v40, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v40, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v40, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v40, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v40, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v40, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v40, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v40, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v40, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v40, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v40, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v40, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v40, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v40, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v40, s101, 62 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v40, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v40, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v40, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v40, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v40, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v40, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v40, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v40, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v40, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v40, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v40, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v40, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v40, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v40, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v40, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v40, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v40, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v40, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v40, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v40, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v40, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v40, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v40, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v40, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v40, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v40, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v40, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v40, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v40, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v40, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v40, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -652,20 +2357,327 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. -; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; MUBUF-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 -; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR: v_mov_b32_e32 v0, 0 -; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 -; FLATSCR: scratch_store_dword off, v0, [[SOFF]] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #3 { +; MUBUF-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 +; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40400 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 +; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1004 +; FLATSCR-NEXT: scratch_store_dword off, v39, s1 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 +; FLATSCR-NEXT: s_addk_i32 s32, 0x100c +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: scratch_store_dword off, v0, s1 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1004 +; FLATSCR-NEXT: scratch_load_dword v39, off, s1 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 593f40fd1b25e..1821872b82c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -1,51 +1,83 @@ -; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; Test calls when called by other callable functions rather than ; kernels. declare void @external_void_func_i32(i32) #0 -; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm: -; GCN: s_waitcnt - ; Spill CSR VGPR used for SGPR spilling -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 -; GCN-DAG: v_writelane_b32 v40, s30, 0 -; GCN-DAG: v_writelane_b32 v40, s31, 1 - -; GCN: s_swappc_b64 - -; GCN: v_readlane_b32 s31, v40, 1 -; GCN: v_readlane_b32 s30, v40, 0 -; GCN: s_mov_b32 s32, s33 - -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_i32_imm() #0 { +; GCN-LABEL: test_func_call_external_void_func_i32_imm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_i32(i32 42) ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: -; GCN: s_waitcnt -; GCN: s_mov_b32 s33, s32 -; GCN-DAG: s_addk_i32 s32, 0x1400{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: -; GCN: s_swappc_b64 -; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { +; GCN-LABEL: test_func_call_external_void_func_i32_imm_stack_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_addk_i32 s32, 0x1400 +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [16 x i32], align 4, addrspace(5) %gep15 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 16 store volatile i32 0, ptr addrspace(5) %alloca @@ -57,3 +89,7 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FIJI: {{.*}} +; GFX9: {{.*}} +; HAWAII: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 0676bc79a46f5..cd7f0c62b0011 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,29 +1,59 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? -; GCN-LABEL: {{^}}i32_fastcc_i32_i32: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { +; FIJI-LABEL: i32_fastcc_i32_i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_setpc_b64 s[30:31] +; +; HAWAII-LABEL: i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: i32_fastcc_i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %add0 = add i32 %arg0, %arg1 ret i32 %add0 } -; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9 -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20 -; GCN: s_waitcnt vmcnt(0) -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 68 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { +; FIJI-LABEL: i32_fastcc_i32_i32_stack_object: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, 9 +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] +; +; HAWAII-LABEL: i32_fastcc_i32_i32_stack_object: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, 9 +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: i32_fastcc_i32_i32_stack_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 store volatile i32 9, ptr addrspace(5) %gep @@ -31,19 +61,33 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { ret i32 %add0 } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32: define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20 -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -52,12 +96,18 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20 -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 136 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -66,45 +116,143 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result: define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void } ; It doesn't make sense to do a tail from a kernel -; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result: -;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +; FIJI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_add_i32 s6, s6, s9 +; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIJI-NEXT: s_add_u32 s0, s0, s9 +; FIJI-NEXT: s_addc_u32 s1, s1, 0 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: s_getpc_b64 s[6:7] +; FIJI-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; FIJI-NEXT: s_mov_b32 s32, 0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s4 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; FIJI-NEXT: s_endpgm +; +; HAWAII-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_add_i32 s6, s6, s9 +; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HAWAII-NEXT: s_add_u32 s0, s0, s9 +; HAWAII-NEXT: s_addc_u32 s1, s1, 0 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: s_getpc_b64 s[6:7] +; HAWAII-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; HAWAII-NEXT: s_mov_b32 s32, 0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[6:7] +; HAWAII-NEXT: s_endpgm +; +; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_endpgm entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void } -; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32: -; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) - -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 - -; GCN-NEXT: s_setpc_b64 s[30:31] define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 { +; FIJI-LABEL: i32_fastcc_i32_byval_i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_setpc_b64 s[30:31] +; +; HAWAII-LABEL: i32_fastcc_i32_byval_i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: i32_fastcc_i32_byval_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %arg1.load = load i32, ptr addrspace(5) %arg1, align 4 %add0 = add i32 %arg0, %arg1.load ret i32 %add0 } ; Tail call disallowed with byval in parent. -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval) ret i32 %ret @@ -113,34 +261,56 @@ entry: ; Tail call disallowed with byval in parent, not callee. The stack ; usage of incoming arguments must be <= the outgoing stack ; arguments. - -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: -; GCN-NOT: v0 -; GCN-NOT: s32 -; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} -; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5))) ret i32 %ret } -; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} - -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]] - - -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9: v_add3_u32 v0, v0, v3, v2 - -; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { +; FIJI-LABEL: i32_fastcc_i32_i32_a32i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_waitcnt vmcnt(1) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; FIJI-NEXT: s_setpc_b64 s[30:31] +; +; HAWAII-LABEL: i32_fastcc_i32_i32_a32i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_waitcnt vmcnt(1) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; HAWAII-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: i32_fastcc_i32_i32_a32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v0, v3, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] %val_firststack = extractvalue [32 x i32] %large, 30 %val_laststack = extractvalue [32 x i32] %large, 31 %add0 = add i32 %arg0, %arg1 @@ -150,31 +320,49 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l } ; FIXME: Why load and store same location for stack args? -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: - -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8 - -; GCN-NOT: s32 - -; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8 - -; GCN-NOT: s32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -186,54 +374,114 @@ entry: ; If the callee requires more stack argument space than the caller, ; don't do a tail call. ; TODO: Do we really need this restriction? - -; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space: -; GCN: s_swappc_b64 -; GCN: s_setpc_b64 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { +; GCN-LABEL: no_sibling_call_callee_more_stack_space: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) ret i32 %ret } ; Have another non-tail in the function -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec -; GCN-DAG: s_addk_i32 s32, 0x400 -; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2 - -; GCN-DAG: s_getpc_b64 s[4:5] -; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 - -; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 - - -; GCN: s_swappc_b64 - -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload - -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 -; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v42, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v42, s30, 0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v42, s31, 1 +; GCN-NEXT: v_mov_b32_e32 v40, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v1, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GCN-NEXT: v_readlane_b32 s31, v42, 1 +; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s6, v42, 2 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call) @@ -242,16 +490,25 @@ entry: ; Have stack object in caller and stack passed arguments. SP should be ; in same place at function exit. - -; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GCN-NOT: s33 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: - -; GCN-NOT: s33 - -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: -; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -260,13 +517,52 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 - -; GCN-NOT: s33 -; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -278,11 +574,18 @@ entry: @func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4 ; Do support tail calls with a uniform, but unknown, callee. -; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]] -; GCN: s_setpc_b64 [[FUNC_PTR]] define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b) @@ -291,14 +594,279 @@ entry: ; We can't support a tail call to a divergent target. Use a waterfall ; loop around a regular call -; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32: -; GCN: v_readfirstlane_b32 -; GCN: v_readfirstlane_b32 -; GCN: s_and_saveexec_b64 -; GCN: s_swappc_b64 -; GCN: s_cbranch_execnz -; GCN: s_setpc_b64 define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 { +; FIJI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_mov_b32 s16, s33 +; FIJI-NEXT: s_mov_b32 s33, s32 +; FIJI-NEXT: s_or_saveexec_b64 s[18:19], -1 +; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; FIJI-NEXT: s_mov_b64 exec, s[18:19] +; FIJI-NEXT: v_writelane_b32 v40, s16, 18 +; FIJI-NEXT: v_writelane_b32 v40, s30, 0 +; FIJI-NEXT: v_writelane_b32 v40, s31, 1 +; FIJI-NEXT: v_writelane_b32 v40, s34, 2 +; FIJI-NEXT: v_writelane_b32 v40, s35, 3 +; FIJI-NEXT: v_writelane_b32 v40, s36, 4 +; FIJI-NEXT: v_writelane_b32 v40, s37, 5 +; FIJI-NEXT: v_writelane_b32 v40, s38, 6 +; FIJI-NEXT: v_writelane_b32 v40, s39, 7 +; FIJI-NEXT: v_writelane_b32 v40, s40, 8 +; FIJI-NEXT: v_writelane_b32 v40, s41, 9 +; FIJI-NEXT: v_writelane_b32 v40, s42, 10 +; FIJI-NEXT: v_writelane_b32 v40, s43, 11 +; FIJI-NEXT: v_writelane_b32 v40, s44, 12 +; FIJI-NEXT: v_writelane_b32 v40, s45, 13 +; FIJI-NEXT: v_writelane_b32 v40, s46, 14 +; FIJI-NEXT: v_writelane_b32 v40, s47, 15 +; FIJI-NEXT: v_writelane_b32 v40, s48, 16 +; FIJI-NEXT: s_mov_b32 s42, s15 +; FIJI-NEXT: s_mov_b32 s43, s14 +; FIJI-NEXT: s_mov_b32 s44, s13 +; FIJI-NEXT: s_mov_b32 s45, s12 +; FIJI-NEXT: s_mov_b64 s[34:35], s[10:11] +; FIJI-NEXT: s_mov_b64 s[36:37], s[8:9] +; FIJI-NEXT: s_mov_b64 s[38:39], s[6:7] +; FIJI-NEXT: s_mov_b64 s[40:41], s[4:5] +; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; FIJI-NEXT: s_mov_b64 s[46:47], exec +; FIJI-NEXT: s_addk_i32 s32, 0x400 +; FIJI-NEXT: v_writelane_b32 v40, s49, 17 +; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; FIJI-NEXT: v_readfirstlane_b32 s16, v0 +; FIJI-NEXT: v_readfirstlane_b32 s17, v1 +; FIJI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; FIJI-NEXT: s_and_saveexec_b64 s[48:49], vcc +; FIJI-NEXT: s_mov_b64 s[4:5], s[40:41] +; FIJI-NEXT: s_mov_b64 s[6:7], s[38:39] +; FIJI-NEXT: s_mov_b64 s[8:9], s[36:37] +; FIJI-NEXT: s_mov_b64 s[10:11], s[34:35] +; FIJI-NEXT: s_mov_b32 s12, s45 +; FIJI-NEXT: s_mov_b32 s13, s44 +; FIJI-NEXT: s_mov_b32 s14, s43 +; FIJI-NEXT: s_mov_b32 s15, s42 +; FIJI-NEXT: v_mov_b32_e32 v0, v2 +; FIJI-NEXT: v_mov_b32_e32 v1, v3 +; FIJI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FIJI-NEXT: v_mov_b32_e32 v4, v0 +; FIJI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; FIJI-NEXT: ; implicit-def: $vgpr31 +; FIJI-NEXT: ; implicit-def: $vgpr2 +; FIJI-NEXT: ; implicit-def: $vgpr3 +; FIJI-NEXT: s_xor_b64 exec, exec, s[48:49] +; FIJI-NEXT: s_cbranch_execnz .LBB18_1 +; FIJI-NEXT: ; %bb.2: +; FIJI-NEXT: s_mov_b64 exec, s[46:47] +; FIJI-NEXT: v_mov_b32_e32 v0, v4 +; FIJI-NEXT: v_readlane_b32 s49, v40, 17 +; FIJI-NEXT: v_readlane_b32 s48, v40, 16 +; FIJI-NEXT: v_readlane_b32 s47, v40, 15 +; FIJI-NEXT: v_readlane_b32 s46, v40, 14 +; FIJI-NEXT: v_readlane_b32 s45, v40, 13 +; FIJI-NEXT: v_readlane_b32 s44, v40, 12 +; FIJI-NEXT: v_readlane_b32 s43, v40, 11 +; FIJI-NEXT: v_readlane_b32 s42, v40, 10 +; FIJI-NEXT: v_readlane_b32 s41, v40, 9 +; FIJI-NEXT: v_readlane_b32 s40, v40, 8 +; FIJI-NEXT: v_readlane_b32 s39, v40, 7 +; FIJI-NEXT: v_readlane_b32 s38, v40, 6 +; FIJI-NEXT: v_readlane_b32 s37, v40, 5 +; FIJI-NEXT: v_readlane_b32 s36, v40, 4 +; FIJI-NEXT: v_readlane_b32 s35, v40, 3 +; FIJI-NEXT: v_readlane_b32 s34, v40, 2 +; FIJI-NEXT: v_readlane_b32 s31, v40, 1 +; FIJI-NEXT: v_readlane_b32 s30, v40, 0 +; FIJI-NEXT: s_mov_b32 s32, s33 +; FIJI-NEXT: v_readlane_b32 s4, v40, 18 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_mov_b32 s33, s4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] +; +; HAWAII-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_mov_b32 s16, s33 +; HAWAII-NEXT: s_mov_b32 s33, s32 +; HAWAII-NEXT: s_or_saveexec_b64 s[18:19], -1 +; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HAWAII-NEXT: s_mov_b64 exec, s[18:19] +; HAWAII-NEXT: v_writelane_b32 v40, s16, 18 +; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 +; HAWAII-NEXT: v_writelane_b32 v40, s34, 2 +; HAWAII-NEXT: v_writelane_b32 v40, s35, 3 +; HAWAII-NEXT: v_writelane_b32 v40, s36, 4 +; HAWAII-NEXT: v_writelane_b32 v40, s37, 5 +; HAWAII-NEXT: v_writelane_b32 v40, s38, 6 +; HAWAII-NEXT: v_writelane_b32 v40, s39, 7 +; HAWAII-NEXT: v_writelane_b32 v40, s40, 8 +; HAWAII-NEXT: v_writelane_b32 v40, s41, 9 +; HAWAII-NEXT: v_writelane_b32 v40, s42, 10 +; HAWAII-NEXT: v_writelane_b32 v40, s43, 11 +; HAWAII-NEXT: v_writelane_b32 v40, s44, 12 +; HAWAII-NEXT: v_writelane_b32 v40, s45, 13 +; HAWAII-NEXT: v_writelane_b32 v40, s46, 14 +; HAWAII-NEXT: v_writelane_b32 v40, s47, 15 +; HAWAII-NEXT: v_writelane_b32 v40, s48, 16 +; HAWAII-NEXT: s_mov_b32 s42, s15 +; HAWAII-NEXT: s_mov_b32 s43, s14 +; HAWAII-NEXT: s_mov_b32 s44, s13 +; HAWAII-NEXT: s_mov_b32 s45, s12 +; HAWAII-NEXT: s_mov_b64 s[34:35], s[10:11] +; HAWAII-NEXT: s_mov_b64 s[36:37], s[8:9] +; HAWAII-NEXT: s_mov_b64 s[38:39], s[6:7] +; HAWAII-NEXT: s_mov_b64 s[40:41], s[4:5] +; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; HAWAII-NEXT: s_mov_b64 s[46:47], exec +; HAWAII-NEXT: s_addk_i32 s32, 0x400 +; HAWAII-NEXT: v_writelane_b32 v40, s49, 17 +; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; HAWAII-NEXT: v_readfirstlane_b32 s16, v0 +; HAWAII-NEXT: v_readfirstlane_b32 s17, v1 +; HAWAII-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; HAWAII-NEXT: s_and_saveexec_b64 s[48:49], vcc +; HAWAII-NEXT: s_mov_b64 s[4:5], s[40:41] +; HAWAII-NEXT: s_mov_b64 s[6:7], s[38:39] +; HAWAII-NEXT: s_mov_b64 s[8:9], s[36:37] +; HAWAII-NEXT: s_mov_b64 s[10:11], s[34:35] +; HAWAII-NEXT: s_mov_b32 s12, s45 +; HAWAII-NEXT: s_mov_b32 s13, s44 +; HAWAII-NEXT: s_mov_b32 s14, s43 +; HAWAII-NEXT: s_mov_b32 s15, s42 +; HAWAII-NEXT: v_mov_b32_e32 v0, v2 +; HAWAII-NEXT: v_mov_b32_e32 v1, v3 +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[16:17] +; HAWAII-NEXT: v_mov_b32_e32 v4, v0 +; HAWAII-NEXT: ; implicit-def: $vgpr0_vgpr1 +; HAWAII-NEXT: ; implicit-def: $vgpr31 +; HAWAII-NEXT: ; implicit-def: $vgpr2 +; HAWAII-NEXT: ; implicit-def: $vgpr3 +; HAWAII-NEXT: s_xor_b64 exec, exec, s[48:49] +; HAWAII-NEXT: s_cbranch_execnz .LBB18_1 +; HAWAII-NEXT: ; %bb.2: +; HAWAII-NEXT: s_mov_b64 exec, s[46:47] +; HAWAII-NEXT: v_mov_b32_e32 v0, v4 +; HAWAII-NEXT: v_readlane_b32 s49, v40, 17 +; HAWAII-NEXT: v_readlane_b32 s48, v40, 16 +; HAWAII-NEXT: v_readlane_b32 s47, v40, 15 +; HAWAII-NEXT: v_readlane_b32 s46, v40, 14 +; HAWAII-NEXT: v_readlane_b32 s45, v40, 13 +; HAWAII-NEXT: v_readlane_b32 s44, v40, 12 +; HAWAII-NEXT: v_readlane_b32 s43, v40, 11 +; HAWAII-NEXT: v_readlane_b32 s42, v40, 10 +; HAWAII-NEXT: v_readlane_b32 s41, v40, 9 +; HAWAII-NEXT: v_readlane_b32 s40, v40, 8 +; HAWAII-NEXT: v_readlane_b32 s39, v40, 7 +; HAWAII-NEXT: v_readlane_b32 s38, v40, 6 +; HAWAII-NEXT: v_readlane_b32 s37, v40, 5 +; HAWAII-NEXT: v_readlane_b32 s36, v40, 4 +; HAWAII-NEXT: v_readlane_b32 s35, v40, 3 +; HAWAII-NEXT: v_readlane_b32 s34, v40, 2 +; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 +; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 +; HAWAII-NEXT: s_mov_b32 s32, s33 +; HAWAII-NEXT: v_readlane_b32 s4, v40, 18 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_mov_b32 s33, s4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s16, 18 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s35, 3 +; GFX9-NEXT: v_writelane_b32 v40, s36, 4 +; GFX9-NEXT: v_writelane_b32 v40, s37, 5 +; GFX9-NEXT: v_writelane_b32 v40, s38, 6 +; GFX9-NEXT: v_writelane_b32 v40, s39, 7 +; GFX9-NEXT: v_writelane_b32 v40, s40, 8 +; GFX9-NEXT: v_writelane_b32 v40, s41, 9 +; GFX9-NEXT: v_writelane_b32 v40, s42, 10 +; GFX9-NEXT: v_writelane_b32 v40, s43, 11 +; GFX9-NEXT: v_writelane_b32 v40, s44, 12 +; GFX9-NEXT: v_writelane_b32 v40, s45, 13 +; GFX9-NEXT: v_writelane_b32 v40, s46, 14 +; GFX9-NEXT: v_writelane_b32 v40, s47, 15 +; GFX9-NEXT: v_writelane_b32 v40, s48, 16 +; GFX9-NEXT: s_mov_b32 s42, s15 +; GFX9-NEXT: s_mov_b32 s43, s14 +; GFX9-NEXT: s_mov_b32 s44, s13 +; GFX9-NEXT: s_mov_b32 s45, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_mov_b64 s[46:47], exec +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s49, 17 +; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v0 +; GFX9-NEXT: v_readfirstlane_b32 s17, v1 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s45 +; GFX9-NEXT: s_mov_b32 s13, s44 +; GFX9-NEXT: s_mov_b32 s14, s43 +; GFX9-NEXT: s_mov_b32 s15, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_xor_b64 exec, exec, s[48:49] +; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_readlane_b32 s49, v40, 17 +; GFX9-NEXT: v_readlane_b32 s48, v40, 16 +; GFX9-NEXT: v_readlane_b32 s47, v40, 15 +; GFX9-NEXT: v_readlane_b32 s46, v40, 14 +; GFX9-NEXT: v_readlane_b32 s45, v40, 13 +; GFX9-NEXT: v_readlane_b32 s44, v40, 12 +; GFX9-NEXT: v_readlane_b32 s43, v40, 11 +; GFX9-NEXT: v_readlane_b32 s42, v40, 10 +; GFX9-NEXT: v_readlane_b32 s41, v40, 9 +; GFX9-NEXT: v_readlane_b32 s40, v40, 8 +; GFX9-NEXT: v_readlane_b32 s39, v40, 7 +; GFX9-NEXT: v_readlane_b32 s38, v40, 6 +; GFX9-NEXT: v_readlane_b32 s37, v40, 5 +; GFX9-NEXT: v_readlane_b32 s36, v40, 4 +; GFX9-NEXT: v_readlane_b32 s35, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 18 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %b, %c %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add) @@ -307,30 +875,30 @@ entry: declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64])) -; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval: -; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}} - -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}} - -; GCN: s_setpc_b64 [[TARGET_ADDR]] define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { +; GCN-LABEL: sibling_call_fastcc_multi_byval: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca0 = alloca [3 x i32], align 16, addrspace(5) %alloca1 = alloca [2 x i64], align 8, addrspace(5) @@ -343,26 +911,55 @@ entry: declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32) ; Callee has a byval and non-byval stack passed argument -; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16 - -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v30, 0 - -; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]] define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 { +; GCN-LABEL: sibling_call_byval_and_stack_passed: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca = alloca [3 x i32], align 16, addrspace(5) store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca @@ -372,13 +969,14 @@ entry: declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0) -; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { +; GCN-LABEL: sibling_call_i64_fastcc_i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a) ret i64 %ret @@ -386,13 +984,14 @@ entry: declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) -; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { +; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -400,13 +999,14 @@ entry: declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0) -; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { +; GCN-LABEL: sibling_call_i16_fastcc_i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a) ret i16 %ret @@ -414,13 +1014,14 @@ entry: declare hidden fastcc half @f16_fastcc_f16(half %arg0) -; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { +; GCN-LABEL: sibling_call_f16_fastcc_f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc half @f16_fastcc_f16(half %a) ret half %ret @@ -428,13 +1029,14 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) -; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { +; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a) ret <3 x i16> %ret @@ -442,13 +1044,14 @@ entry: declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0) -; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 { +; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a) ret <4 x i16> %ret @@ -456,13 +1059,14 @@ entry: declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0) -; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 { +; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a) ret <2 x i64> %ret