From 8590a0072af2d419e5d9c5ef13cc96cf50daca16 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Mon, 27 Jan 2025 11:07:51 +0530 Subject: [PATCH 1/4] [NFC][AMDGPU] Autogenerating test cases --- .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 2800 ++++++++++++++--- llvm/test/CodeGen/AMDGPU/nested-calls.ll | 96 +- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 800 +++-- 3 files changed, 3038 insertions(+), 658 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 6fb071dd42d2f..3241a76d46a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,123 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s -; GCN-LABEL: {{^}}callee_no_stack: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack() #0 { +; GCN-LABEL: callee_no_stack: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_all() #1 { +; MUBUF-LABEL: callee_no_stack_no_fp_elim_all: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_no_stack_no_fp_elim_all: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_nonleaf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_nonleaf() #2 { +; GCN-LABEL: callee_no_stack_no_fp_elim_nonleaf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } -; GCN-LABEL: {{^}}callee_with_stack: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { +; MUBUF-LABEL: callee_with_stack: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } ; Can use free call clobbered register to preserve original FP value. - -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: s_addk_i32 s32, 0x200 -; FLATSCR-NEXT: s_add_i32 s32, s32, 8 -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_all: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_all: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_non_leaf: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}} -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_non_leaf: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_non_leaf: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}callee_with_stack_and_call: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2 -; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} -; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, - -; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} -; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}} - -; GCN: s_swappc_b64 - -; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] -; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] - -; MUBUF: s_mov_b32 s32, s33{{$}} -; FLATSCR: s_mov_b32 s32, s33{{$}} -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) - -; GCN-NEXT: s_setpc_b64 s[30:31] define void @callee_with_stack_and_call() #0 { +; MUBUF-LABEL: callee_with_stack_and_call: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s16, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[18:19], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[18:19] +; MUBUF-NEXT: v_writelane_b32 v40, s16, 2 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_and_call: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void @external_void_func_void() @@ -130,36 +184,60 @@ define void @callee_with_stack_and_call() #0 { ; There is stack usage only because of the need to evict a VGPR for ; spilling CSR SGPRs. -; GCN-LABEL: {{^}}callee_no_stack_with_call: -; GCN: s_waitcnt -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF-DAG: s_addk_i32 s32, 0x400 -; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] - -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN: s_swappc_b64 - -; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 -; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 - -; MUBUF: s_mov_b32 s32, s33 -; FLATSCR: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]] -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @callee_no_stack_with_call() #0 { +; MUBUF-LABEL: callee_no_stack_with_call: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s16, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[18:19], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[18:19] +; MUBUF-NEXT: v_writelane_b32 v40, s16, 2 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_no_stack_with_call: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() ret void } @@ -168,26 +246,306 @@ declare hidden void @external_void_func_void() #0 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and ; restored. No FP is required. -; -; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s -; GCN: v_writelane_b32 [[CSR_VGPR]], s - -; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] -; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] - -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { +; MUBUF-LABEL: callee_func_sgpr_spill_no_calls: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s36, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s37, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s38, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s39, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s41, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s42, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s43, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s44, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s45, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s46, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s47, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s56, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s57, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s58, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s59, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s60, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s61, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s62, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s63, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 30 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 31 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 32 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 33 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 34 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 35 +; MUBUF-NEXT: v_writelane_b32 v40, s72, 36 +; MUBUF-NEXT: v_writelane_b32 v40, s73, 37 +; MUBUF-NEXT: v_writelane_b32 v40, s74, 38 +; MUBUF-NEXT: v_writelane_b32 v40, s75, 39 +; MUBUF-NEXT: v_writelane_b32 v40, s76, 40 +; MUBUF-NEXT: v_writelane_b32 v40, s77, 41 +; MUBUF-NEXT: v_writelane_b32 v40, s78, 42 +; MUBUF-NEXT: v_writelane_b32 v40, s79, 43 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 44 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 45 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 46 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 47 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[68:83] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[52:67] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[36:51] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[4:19] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[20:27] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s[28:29] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[68:83] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[52:67] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[36:51] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[20:27] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[28:29] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s[4:19] +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s83, v40, 47 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 46 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 45 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 44 +; MUBUF-NEXT: v_readlane_b32 s79, v40, 43 +; MUBUF-NEXT: v_readlane_b32 s78, v40, 42 +; MUBUF-NEXT: v_readlane_b32 s77, v40, 41 +; MUBUF-NEXT: v_readlane_b32 s76, v40, 40 +; MUBUF-NEXT: v_readlane_b32 s75, v40, 39 +; MUBUF-NEXT: v_readlane_b32 s74, v40, 38 +; MUBUF-NEXT: v_readlane_b32 s73, v40, 37 +; MUBUF-NEXT: v_readlane_b32 s72, v40, 36 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 35 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 34 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 33 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 32 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 29 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s63, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s62, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s61, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s60, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s59, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s58, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s57, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s56, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s47, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s46, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s45, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s44, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s43, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s42, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s41, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s40, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s39, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s38, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s37, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s36, v40, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_func_sgpr_spill_no_calls: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s36, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s37, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s38, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s41, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s42, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s43, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s44, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s45, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s46, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s47, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s56, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s57, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s58, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s59, 27 +; FLATSCR-NEXT: v_writelane_b32 v40, s60, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s61, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s62, 30 +; FLATSCR-NEXT: v_writelane_b32 v40, s63, 31 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 32 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 33 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 34 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 35 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 36 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 37 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 38 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 39 +; FLATSCR-NEXT: v_writelane_b32 v40, s72, 40 +; FLATSCR-NEXT: v_writelane_b32 v40, s73, 41 +; FLATSCR-NEXT: v_writelane_b32 v40, s74, 42 +; FLATSCR-NEXT: v_writelane_b32 v40, s75, 43 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[52:67] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[36:51] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[16:31] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[0:15] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[68:75] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s[34:35] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[52:67] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[36:51] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[16:31] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[68:75] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[34:35] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s[0:15] +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s75, v40, 43 +; FLATSCR-NEXT: v_readlane_b32 s74, v40, 42 +; FLATSCR-NEXT: v_readlane_b32 s73, v40, 41 +; FLATSCR-NEXT: v_readlane_b32 s72, v40, 40 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 39 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 38 +; FLATSCR-NEXT: v_readlane_b32 s69, v40, 37 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 36 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 35 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 34 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 33 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 32 +; FLATSCR-NEXT: v_readlane_b32 s63, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s62, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s61, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s60, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s59, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s58, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s57, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s56, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s47, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s46, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s45, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s44, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s43, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s42, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s41, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s38, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s37, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s36, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 @@ -212,55 +570,83 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to ; enable all lanes and restore. - -; GCN-LABEL: {{^}}spill_only_csr_sgpr: -; GCN: s_waitcnt -; GCN-NEXT: s_xor_saveexec_b64 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -; GCN-NEXT: v_writelane_b32 v0, s42, 0 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; clobber s42 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s42, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @spill_only_csr_sgpr() { +; MUBUF-LABEL: spill_only_csr_sgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s42, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber s42 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s42, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: spill_only_csr_sgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s42, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber s42 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s42, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber s42", "~{s42}"() ret void } ; TODO: Can the SP inc/deec be remvoed? -; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr: -; GCN: s_waitcnt -; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 -; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; clobber v41 -; GCN-NEXT: ;;#ASMEND - -; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload -; MUBUF: s_addk_i32 s32, 0x300 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; MUBUF-NEXT: s_mov_b32 s33, s4 -; FLATSCR: s_add_i32 s32, s32, 12 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s33, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { +; MUBUF-LABEL: callee_with_stack_no_fp_elim_csr_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_with_stack_no_fp_elim_csr_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -268,32 +654,312 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { } ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. -; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v1 -; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 -; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 -; GCN: ;;#ASMSTART - -; MUBUF: s_mov_b32 s32, s33 -; FLATSCR: s_mov_b32 s32, s33 - -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { +; MUBUF-LABEL: last_lane_vgpr_for_fp_csr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v1, s40, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s41, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s42, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s43, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s44, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s45, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s46, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s47, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s56, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s57, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s58, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s59, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s60, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s61, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s62, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s63, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 30 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 31 +; MUBUF-NEXT: v_writelane_b32 v1, s72, 32 +; MUBUF-NEXT: v_writelane_b32 v1, s73, 33 +; MUBUF-NEXT: v_writelane_b32 v1, s74, 34 +; MUBUF-NEXT: v_writelane_b32 v1, s75, 35 +; MUBUF-NEXT: v_writelane_b32 v1, s76, 36 +; MUBUF-NEXT: v_writelane_b32 v1, s77, 37 +; MUBUF-NEXT: v_writelane_b32 v1, s78, 38 +; MUBUF-NEXT: v_writelane_b32 v1, s79, 39 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 40 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 41 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 42 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 43 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 44 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 45 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 46 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 47 +; MUBUF-NEXT: v_writelane_b32 v1, s88, 48 +; MUBUF-NEXT: v_writelane_b32 v1, s89, 49 +; MUBUF-NEXT: v_writelane_b32 v1, s90, 50 +; MUBUF-NEXT: v_writelane_b32 v1, s91, 51 +; MUBUF-NEXT: v_writelane_b32 v1, s92, 52 +; MUBUF-NEXT: v_writelane_b32 v1, s93, 53 +; MUBUF-NEXT: v_writelane_b32 v1, s94, 54 +; MUBUF-NEXT: v_writelane_b32 v1, s95, 55 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 56 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 57 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 58 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 59 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 60 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v1, s101, 61 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_writelane_b32 v1, s102, 62 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 62 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 61 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 60 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 59 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 58 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 57 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 56 +; MUBUF-NEXT: v_readlane_b32 s95, v1, 55 +; MUBUF-NEXT: v_readlane_b32 s94, v1, 54 +; MUBUF-NEXT: v_readlane_b32 s93, v1, 53 +; MUBUF-NEXT: v_readlane_b32 s92, v1, 52 +; MUBUF-NEXT: v_readlane_b32 s91, v1, 51 +; MUBUF-NEXT: v_readlane_b32 s90, v1, 50 +; MUBUF-NEXT: v_readlane_b32 s89, v1, 49 +; MUBUF-NEXT: v_readlane_b32 s88, v1, 48 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 47 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 46 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 45 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 44 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 43 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 42 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 41 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 40 +; MUBUF-NEXT: v_readlane_b32 s79, v1, 39 +; MUBUF-NEXT: v_readlane_b32 s78, v1, 38 +; MUBUF-NEXT: v_readlane_b32 s77, v1, 37 +; MUBUF-NEXT: v_readlane_b32 s76, v1, 36 +; MUBUF-NEXT: v_readlane_b32 s75, v1, 35 +; MUBUF-NEXT: v_readlane_b32 s74, v1, 34 +; MUBUF-NEXT: v_readlane_b32 s73, v1, 33 +; MUBUF-NEXT: v_readlane_b32 s72, v1, 32 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s63, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s62, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s61, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s60, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s59, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s58, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s57, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s56, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s47, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s46, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s45, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s44, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s43, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s42, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s41, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s40, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: last_lane_vgpr_for_fp_csr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v1, s40, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s41, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s42, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s43, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s44, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s45, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s46, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s47, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s56, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s57, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s58, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s59, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s60, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s61, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s62, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s63, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 30 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 31 +; FLATSCR-NEXT: v_writelane_b32 v1, s72, 32 +; FLATSCR-NEXT: v_writelane_b32 v1, s73, 33 +; FLATSCR-NEXT: v_writelane_b32 v1, s74, 34 +; FLATSCR-NEXT: v_writelane_b32 v1, s75, 35 +; FLATSCR-NEXT: v_writelane_b32 v1, s76, 36 +; FLATSCR-NEXT: v_writelane_b32 v1, s77, 37 +; FLATSCR-NEXT: v_writelane_b32 v1, s78, 38 +; FLATSCR-NEXT: v_writelane_b32 v1, s79, 39 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 40 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 41 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 42 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 43 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 44 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 45 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 46 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 47 +; FLATSCR-NEXT: v_writelane_b32 v1, s88, 48 +; FLATSCR-NEXT: v_writelane_b32 v1, s89, 49 +; FLATSCR-NEXT: v_writelane_b32 v1, s90, 50 +; FLATSCR-NEXT: v_writelane_b32 v1, s91, 51 +; FLATSCR-NEXT: v_writelane_b32 v1, s92, 52 +; FLATSCR-NEXT: v_writelane_b32 v1, s93, 53 +; FLATSCR-NEXT: v_writelane_b32 v1, s94, 54 +; FLATSCR-NEXT: v_writelane_b32 v1, s95, 55 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 56 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 57 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 58 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 59 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 60 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 61 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 62 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 62 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 61 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 60 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 59 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 58 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 57 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 56 +; FLATSCR-NEXT: v_readlane_b32 s95, v1, 55 +; FLATSCR-NEXT: v_readlane_b32 s94, v1, 54 +; FLATSCR-NEXT: v_readlane_b32 s93, v1, 53 +; FLATSCR-NEXT: v_readlane_b32 s92, v1, 52 +; FLATSCR-NEXT: v_readlane_b32 s91, v1, 51 +; FLATSCR-NEXT: v_readlane_b32 s90, v1, 50 +; FLATSCR-NEXT: v_readlane_b32 s89, v1, 49 +; FLATSCR-NEXT: v_readlane_b32 s88, v1, 48 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 47 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 46 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 45 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 44 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 43 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 42 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 41 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 40 +; FLATSCR-NEXT: v_readlane_b32 s79, v1, 39 +; FLATSCR-NEXT: v_readlane_b32 s78, v1, 38 +; FLATSCR-NEXT: v_readlane_b32 s77, v1, 37 +; FLATSCR-NEXT: v_readlane_b32 s76, v1, 36 +; FLATSCR-NEXT: v_readlane_b32 s75, v1, 35 +; FLATSCR-NEXT: v_readlane_b32 s74, v1, 34 +; FLATSCR-NEXT: v_readlane_b32 s73, v1, 33 +; FLATSCR-NEXT: v_readlane_b32 s72, v1, 32 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s63, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s62, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s61, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s60, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s59, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s58, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s57, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s56, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s47, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s46, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s45, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s44, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s43, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s42, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s41, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s40, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -310,37 +976,316 @@ define void @last_lane_vgpr_for_fp_csr() #1 { } ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. -; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-61: v_writelane_b32 v1, -; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v1, -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword -; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v1, -; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; MUBUF-NEXT: s_mov_b32 s32, s33 -; FLATSCR-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @no_new_vgpr_for_fp_csr() #1 { +; MUBUF-LABEL: no_new_vgpr_for_fp_csr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v1, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v1, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v1, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v1, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v1, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v1, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v1, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v1, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v1, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v1, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v1, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v1, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v1, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v1, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v1, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v1, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v1, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v1, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v1, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v1, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v1, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v1, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v1, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v1, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v1, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v1, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v1, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v1, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v1, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v1, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v1, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v1, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v1, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v1, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v1, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v1, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v1, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v1, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v1, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v1, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v1, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v1, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v1, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v1, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v1, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v1, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v1, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v1, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v1, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v1, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v1, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v1, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v1, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v1, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v1, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v1, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v1, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v1, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v1, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v1, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v1, s100, 61 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v1, s101, 62 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber v41 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_writelane_b32 v1, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_readlane_b32 s102, v1, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v1, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v1, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v1, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v1, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v1, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v1, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v1, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v1, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v1, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v1, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v1, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v1, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v1, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v1, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v1, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v1, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v1, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v1, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v1, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v1, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v1, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v1, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v1, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v1, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v1, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v1, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v1, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v1, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v1, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v1, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v1, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v1, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v1, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v1, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v1, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v1, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v1, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v1, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v1, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v1, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v1, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v1, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v1, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v1, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v1, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v1, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v1, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v1, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v1, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v1, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v1, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v1, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v1, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v1, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v1, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v1, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v1, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v1, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v1, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v1, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v1, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_new_vgpr_for_fp_csr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v1, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v1, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v1, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v1, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v1, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v1, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v1, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v1, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v1, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v1, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v1, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v1, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v1, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v1, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v1, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v1, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v1, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v1, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v1, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v1, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v1, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v1, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v1, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v1, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v1, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v1, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v1, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v1, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v1, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v1, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v1, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v1, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v1, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v1, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v1, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v1, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v1, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v1, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v1, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v1, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v1, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v1, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v1, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v1, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v1, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v1, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v1, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v1, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v1, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v1, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v1, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v1, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v1, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v1, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v1, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v1, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v1, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v1, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v1, s100, 61 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v1, s101, 62 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber v41 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_writelane_b32 v1, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_readlane_b32 s102, v1, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v1, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v1, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v1, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v1, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v1, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v1, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v1, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v1, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v1, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v1, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v1, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v1, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v1, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v1, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v1, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v1, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v1, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v1, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v1, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v1, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v1, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v1, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v1, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v1, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v1, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v1, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v1, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v1, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v1, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v1, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v1, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v1, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v1, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v1, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v1, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v1, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v1, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v1, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v1, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v1, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v1, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v1, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v1, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v1, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v1, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v1, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v1, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v1, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v1, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v1, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v1, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v1, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v1, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v1, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v1, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v1, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v1, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v1, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v1, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v1, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v1, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "; clobber v41", "~{v41}"() @@ -356,64 +1301,99 @@ define void @no_new_vgpr_for_fp_csr() #1 { ret void } -; GCN-LABEL: {{^}}realign_stack_no_fp_elim: -; GCN: s_waitcnt -; MUBUF-NEXT: s_mov_b32 [[FP_COPY:s4]], s33 -; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33 -; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 -; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff -; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 -; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 -; MUBUF-NEXT: s_mov_b32 s5, s34 -; FLATSCR-NEXT: s_mov_b32 s1, s34 -; MUBUF-NEXT: s_mov_b32 s34, s32 -; FLATSCR-NEXT: s_mov_b32 s34, s32 -; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 -; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 -; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}} -; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}} -; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_mov_b32 s32, s34 -; MUBUF-NEXT: s_mov_b32 s34, s5 -; FLATSCR-NEXT: s_mov_b32 s32, s34 -; FLATSCR-NEXT: s_mov_b32 s34, s1 -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { +; MUBUF-LABEL: realign_stack_no_fp_elim: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 +; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 +; MUBUF-NEXT: s_mov_b32 s5, s34 +; MUBUF-NEXT: s_mov_b32 s34, s32 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s5 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: realign_stack_no_fp_elim: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff +; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_mov_b32 s1, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s32 +; FLATSCR-NEXT: s_addk_i32 s32, 0x6000 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s1 +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 8192, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } -; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} -; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1 -; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0 -;GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp() #1 { +; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -428,31 +1408,64 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { } ; Need a new CSR VGPR to satisfy the FP spill. -; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_mov_b32_e32 - -; MUBUF: s_addk_i32 s32, 0x300{{$}} -; FLATSCR: s_add_i32 s32, s32, 12{{$}} -; MUBUF-DAG: buffer_store_dword -; FLATSCR-DAG: scratch_store_dword - -; GCN: ;;#ASMSTART -; GCN: s_mov_b32 s32, s33 -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { +; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -474,32 +1487,72 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; The byval argument exceeds the MUBUF constant offset, so a scratch ; register is needed to access the CSR VGPR slot. -; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: -; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 vcc_lo, s33 -; GCN-DAG: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; MUBUF-DAG: s_add_i32 s32, s32, 0x40300{{$}} -; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}} -; MUBUF-DAG: buffer_store_dword -; FLATSCR-DAG: scratch_store_dword - -; GCN: ;;#ASMSTART -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, vcc_lo -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #1 { +; MUBUF-LABEL: scratch_reg_needed_mubuf_offset: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 vcc_lo, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s6 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, vcc_lo +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 vcc_lo, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 +; FLATSCR-NEXT: scratch_store_dword off, v40, s2 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_addk_i32 s32, 0x100c +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s0, s33, 0x1000 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 +; FLATSCR-NEXT: scratch_load_dword v40, off, s2 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, vcc_lo +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -520,25 +1573,72 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ret void } -; GCN-LABEL: {{^}}local_empty_func: -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 define internal void @local_empty_func() #0 { +; GCN-LABEL: local_empty_func: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ret void } ; An FP is needed, despite not needing any spills ; TODO: Ccould see callee does not use stack and omit FP. -; GCN-LABEL: {{^}}ipra_call_with_stack: -; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 -; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} -; GCN: s_swappc_b64 -; GCN: s_mov_b32 s33, [[TMP_SGPR]] define void @ipra_call_with_stack() #0 { +; MUBUF-LABEL: ipra_call_with_stack: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s18, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[16:17] +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_getpc_b64 s[16:17] +; MUBUF-NEXT: s_add_u32 s16, s16, local_empty_func@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s17, s17, local_empty_func@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_mov_b32 s33, s18 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: ipra_call_with_stack: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s2, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, local_empty_func@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, local_empty_func@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_mov_b32 s33, s2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void @local_empty_func() @@ -546,21 +1646,41 @@ define void @ipra_call_with_stack() #0 { } ; With no free registers, we must spill the FP to memory. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s0, s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF: s_waitcnt vmcnt(0) -; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] -; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; FLATSCR: s_mov_b32 s33, s0 -; GCN: s_setpc_b64 -; MUBUF: ScratchSize: 8 -; FLATSCR: ScratchSize: 0 define void @callee_need_to_spill_fp_to_memory() #3 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_memory: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x200 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -578,23 +1698,313 @@ define void @callee_need_to_spill_fp_to_memory() #3 { ; If we have a reserved VGPR that can be used for SGPR spills, we may still ; need to spill the FP to memory if there are no free lanes in the reserved ; VGPR. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] -; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] -; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 offset:[[OFF:[0-9]+]] -; GCN-NOT: v_writelane_b32 v40, s33 -; GCN-NOT: v_readlane_b32 s33, v40 -; GCN-NOT: v_readlane_b32 s33, v40 -; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 offset:[[OFF]] -; MUBUF: v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]] -; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; MUBUF: s_mov_b64 exec, [[COPY_EXEC2]] -; MUBUF: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v39, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v39, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -619,17 +2029,312 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { ; the exec register is saved to s0 when saving CSR in the function prolog. ; Make sure that the FP save happens after restoring exec from the same ; register. -; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s33, s32 -; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { +; MUBUF-LABEL: callee_need_to_spill_fp_to_reg: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v40, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v40, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v40, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v40, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v40, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v40, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v40, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v40, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v40, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v40, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v40, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v40, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v40, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v40, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v40, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v40, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v40, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v40, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v40, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v40, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v40, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v40, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v40, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v40, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v40, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v40, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v40, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v40, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v40, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v40, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v40, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v40, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v40, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v40, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v40, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v40, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v40, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v40, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v40, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v40, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v40, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v40, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v40, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v40, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v40, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v40, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v40, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v40, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v40, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v40, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v40, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v40, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v40, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v40, s99, 60 +; MUBUF-NEXT: v_writelane_b32 v40, s100, 61 +; MUBUF-NEXT: v_writelane_b32 v40, s101, 62 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_writelane_b32 v40, s102, 63 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s102, v40, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v40, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v40, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v40, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v40, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v40, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v40, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v40, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v40, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v40, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v40, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v40, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v40, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v40, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v40, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v40, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v40, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v40, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v40, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v40, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v40, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v40, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v40, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v40, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v40, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v40, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v40, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v40, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v40, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v40, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v40, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v40, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v40, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v40, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v40, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v40, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v40, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v40, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v40, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v40, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v40, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v40, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v40, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v40, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v40, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v40, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v40, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v40, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v40, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v40, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v40, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v40, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v40, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v40, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v40, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v40, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v40, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v40, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v40, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v40, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_need_to_spill_fp_to_reg: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v40, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v40, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v40, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v40, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v40, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v40, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v40, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v40, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v40, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v40, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v40, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v40, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v40, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v40, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v40, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v40, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v40, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v40, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v40, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v40, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v40, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v40, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v40, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v40, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v40, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v40, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v40, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v40, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v40, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v40, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v40, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v40, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v40, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v40, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v40, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v40, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v40, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v40, s99, 60 +; FLATSCR-NEXT: v_writelane_b32 v40, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v40, s101, 62 +; FLATSCR-NEXT: s_add_i32 s32, s32, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s102, 63 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v40, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v40, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v40, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v40, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v40, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v40, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v40, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v40, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v40, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v40, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v40, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v40, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v40, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v40, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v40, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v40, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v40, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v40, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v40, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v40, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v40, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v40, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v40, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v40, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v40, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v40, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v40, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v40, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v40, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v40, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v40, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v40, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v40, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v40, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v40, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v40, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v40, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v40, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v40, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v40, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v40, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v40, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} @@ -652,20 +2357,327 @@ define void @callee_need_to_spill_fp_to_reg() #1 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. -; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset -; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; MUBUF-NEXT: s_mov_b32 s33, s32 -; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 -; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 -; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR: v_mov_b32_e32 v0, 0 -; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 -; FLATSCR: scratch_store_dword off, v0, [[SOFF]] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #3 { +; MUBUF-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 +; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v39, s39, 0 +; MUBUF-NEXT: v_writelane_b32 v39, s40, 1 +; MUBUF-NEXT: v_writelane_b32 v39, s41, 2 +; MUBUF-NEXT: v_writelane_b32 v39, s42, 3 +; MUBUF-NEXT: v_writelane_b32 v39, s43, 4 +; MUBUF-NEXT: v_writelane_b32 v39, s44, 5 +; MUBUF-NEXT: v_writelane_b32 v39, s45, 6 +; MUBUF-NEXT: v_writelane_b32 v39, s46, 7 +; MUBUF-NEXT: v_writelane_b32 v39, s47, 8 +; MUBUF-NEXT: v_writelane_b32 v39, s48, 9 +; MUBUF-NEXT: v_writelane_b32 v39, s49, 10 +; MUBUF-NEXT: v_writelane_b32 v39, s50, 11 +; MUBUF-NEXT: v_writelane_b32 v39, s51, 12 +; MUBUF-NEXT: v_writelane_b32 v39, s52, 13 +; MUBUF-NEXT: v_writelane_b32 v39, s53, 14 +; MUBUF-NEXT: v_writelane_b32 v39, s54, 15 +; MUBUF-NEXT: v_writelane_b32 v39, s55, 16 +; MUBUF-NEXT: v_writelane_b32 v39, s56, 17 +; MUBUF-NEXT: v_writelane_b32 v39, s57, 18 +; MUBUF-NEXT: v_writelane_b32 v39, s58, 19 +; MUBUF-NEXT: v_writelane_b32 v39, s59, 20 +; MUBUF-NEXT: v_writelane_b32 v39, s60, 21 +; MUBUF-NEXT: v_writelane_b32 v39, s61, 22 +; MUBUF-NEXT: v_writelane_b32 v39, s62, 23 +; MUBUF-NEXT: v_writelane_b32 v39, s63, 24 +; MUBUF-NEXT: v_writelane_b32 v39, s64, 25 +; MUBUF-NEXT: v_writelane_b32 v39, s65, 26 +; MUBUF-NEXT: v_writelane_b32 v39, s66, 27 +; MUBUF-NEXT: v_writelane_b32 v39, s67, 28 +; MUBUF-NEXT: v_writelane_b32 v39, s68, 29 +; MUBUF-NEXT: v_writelane_b32 v39, s69, 30 +; MUBUF-NEXT: v_writelane_b32 v39, s70, 31 +; MUBUF-NEXT: v_writelane_b32 v39, s71, 32 +; MUBUF-NEXT: v_writelane_b32 v39, s72, 33 +; MUBUF-NEXT: v_writelane_b32 v39, s73, 34 +; MUBUF-NEXT: v_writelane_b32 v39, s74, 35 +; MUBUF-NEXT: v_writelane_b32 v39, s75, 36 +; MUBUF-NEXT: v_writelane_b32 v39, s76, 37 +; MUBUF-NEXT: v_writelane_b32 v39, s77, 38 +; MUBUF-NEXT: v_writelane_b32 v39, s78, 39 +; MUBUF-NEXT: v_writelane_b32 v39, s79, 40 +; MUBUF-NEXT: v_writelane_b32 v39, s80, 41 +; MUBUF-NEXT: v_writelane_b32 v39, s81, 42 +; MUBUF-NEXT: v_writelane_b32 v39, s82, 43 +; MUBUF-NEXT: v_writelane_b32 v39, s83, 44 +; MUBUF-NEXT: v_writelane_b32 v39, s84, 45 +; MUBUF-NEXT: v_writelane_b32 v39, s85, 46 +; MUBUF-NEXT: v_writelane_b32 v39, s86, 47 +; MUBUF-NEXT: v_writelane_b32 v39, s87, 48 +; MUBUF-NEXT: v_writelane_b32 v39, s88, 49 +; MUBUF-NEXT: v_writelane_b32 v39, s89, 50 +; MUBUF-NEXT: v_writelane_b32 v39, s90, 51 +; MUBUF-NEXT: v_writelane_b32 v39, s91, 52 +; MUBUF-NEXT: v_writelane_b32 v39, s92, 53 +; MUBUF-NEXT: v_writelane_b32 v39, s93, 54 +; MUBUF-NEXT: v_writelane_b32 v39, s94, 55 +; MUBUF-NEXT: v_writelane_b32 v39, s95, 56 +; MUBUF-NEXT: v_writelane_b32 v39, s96, 57 +; MUBUF-NEXT: v_writelane_b32 v39, s97, 58 +; MUBUF-NEXT: v_writelane_b32 v39, s98, 59 +; MUBUF-NEXT: v_writelane_b32 v39, s99, 60 +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 +; MUBUF-NEXT: v_writelane_b32 v39, s100, 61 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v39, s101, 62 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 +; MUBUF-NEXT: v_writelane_b32 v39, s102, 63 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber all VGPRs except CSR v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40200 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40400 +; MUBUF-NEXT: v_readlane_b32 s102, v39, 63 +; MUBUF-NEXT: v_readlane_b32 s101, v39, 62 +; MUBUF-NEXT: v_readlane_b32 s100, v39, 61 +; MUBUF-NEXT: v_readlane_b32 s99, v39, 60 +; MUBUF-NEXT: v_readlane_b32 s98, v39, 59 +; MUBUF-NEXT: v_readlane_b32 s97, v39, 58 +; MUBUF-NEXT: v_readlane_b32 s96, v39, 57 +; MUBUF-NEXT: v_readlane_b32 s95, v39, 56 +; MUBUF-NEXT: v_readlane_b32 s94, v39, 55 +; MUBUF-NEXT: v_readlane_b32 s93, v39, 54 +; MUBUF-NEXT: v_readlane_b32 s92, v39, 53 +; MUBUF-NEXT: v_readlane_b32 s91, v39, 52 +; MUBUF-NEXT: v_readlane_b32 s90, v39, 51 +; MUBUF-NEXT: v_readlane_b32 s89, v39, 50 +; MUBUF-NEXT: v_readlane_b32 s88, v39, 49 +; MUBUF-NEXT: v_readlane_b32 s87, v39, 48 +; MUBUF-NEXT: v_readlane_b32 s86, v39, 47 +; MUBUF-NEXT: v_readlane_b32 s85, v39, 46 +; MUBUF-NEXT: v_readlane_b32 s84, v39, 45 +; MUBUF-NEXT: v_readlane_b32 s83, v39, 44 +; MUBUF-NEXT: v_readlane_b32 s82, v39, 43 +; MUBUF-NEXT: v_readlane_b32 s81, v39, 42 +; MUBUF-NEXT: v_readlane_b32 s80, v39, 41 +; MUBUF-NEXT: v_readlane_b32 s79, v39, 40 +; MUBUF-NEXT: v_readlane_b32 s78, v39, 39 +; MUBUF-NEXT: v_readlane_b32 s77, v39, 38 +; MUBUF-NEXT: v_readlane_b32 s76, v39, 37 +; MUBUF-NEXT: v_readlane_b32 s75, v39, 36 +; MUBUF-NEXT: v_readlane_b32 s74, v39, 35 +; MUBUF-NEXT: v_readlane_b32 s73, v39, 34 +; MUBUF-NEXT: v_readlane_b32 s72, v39, 33 +; MUBUF-NEXT: v_readlane_b32 s71, v39, 32 +; MUBUF-NEXT: v_readlane_b32 s70, v39, 31 +; MUBUF-NEXT: v_readlane_b32 s69, v39, 30 +; MUBUF-NEXT: v_readlane_b32 s68, v39, 29 +; MUBUF-NEXT: v_readlane_b32 s67, v39, 28 +; MUBUF-NEXT: v_readlane_b32 s66, v39, 27 +; MUBUF-NEXT: v_readlane_b32 s65, v39, 26 +; MUBUF-NEXT: v_readlane_b32 s64, v39, 25 +; MUBUF-NEXT: v_readlane_b32 s63, v39, 24 +; MUBUF-NEXT: v_readlane_b32 s62, v39, 23 +; MUBUF-NEXT: v_readlane_b32 s61, v39, 22 +; MUBUF-NEXT: v_readlane_b32 s60, v39, 21 +; MUBUF-NEXT: v_readlane_b32 s59, v39, 20 +; MUBUF-NEXT: v_readlane_b32 s58, v39, 19 +; MUBUF-NEXT: v_readlane_b32 s57, v39, 18 +; MUBUF-NEXT: v_readlane_b32 s56, v39, 17 +; MUBUF-NEXT: v_readlane_b32 s55, v39, 16 +; MUBUF-NEXT: v_readlane_b32 s54, v39, 15 +; MUBUF-NEXT: v_readlane_b32 s53, v39, 14 +; MUBUF-NEXT: v_readlane_b32 s52, v39, 13 +; MUBUF-NEXT: v_readlane_b32 s51, v39, 12 +; MUBUF-NEXT: v_readlane_b32 s50, v39, 11 +; MUBUF-NEXT: v_readlane_b32 s49, v39, 10 +; MUBUF-NEXT: v_readlane_b32 s48, v39, 9 +; MUBUF-NEXT: v_readlane_b32 s47, v39, 8 +; MUBUF-NEXT: v_readlane_b32 s46, v39, 7 +; MUBUF-NEXT: v_readlane_b32 s45, v39, 6 +; MUBUF-NEXT: v_readlane_b32 s44, v39, 5 +; MUBUF-NEXT: v_readlane_b32 s43, v39, 4 +; MUBUF-NEXT: v_readlane_b32 s42, v39, 3 +; MUBUF-NEXT: v_readlane_b32 s41, v39, 2 +; MUBUF-NEXT: v_readlane_b32 s40, v39, 1 +; MUBUF-NEXT: v_readlane_b32 s39, v39, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_readfirstlane_b32 s4, v0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: s_add_i32 s5, s33, 0x40100 +; MUBUF-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1004 +; FLATSCR-NEXT: scratch_store_dword off, v39, s1 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v39, s39, 0 +; FLATSCR-NEXT: v_writelane_b32 v39, s40, 1 +; FLATSCR-NEXT: v_writelane_b32 v39, s41, 2 +; FLATSCR-NEXT: v_writelane_b32 v39, s42, 3 +; FLATSCR-NEXT: v_writelane_b32 v39, s43, 4 +; FLATSCR-NEXT: v_writelane_b32 v39, s44, 5 +; FLATSCR-NEXT: v_writelane_b32 v39, s45, 6 +; FLATSCR-NEXT: v_writelane_b32 v39, s46, 7 +; FLATSCR-NEXT: v_writelane_b32 v39, s47, 8 +; FLATSCR-NEXT: v_writelane_b32 v39, s48, 9 +; FLATSCR-NEXT: v_writelane_b32 v39, s49, 10 +; FLATSCR-NEXT: v_writelane_b32 v39, s50, 11 +; FLATSCR-NEXT: v_writelane_b32 v39, s51, 12 +; FLATSCR-NEXT: v_writelane_b32 v39, s52, 13 +; FLATSCR-NEXT: v_writelane_b32 v39, s53, 14 +; FLATSCR-NEXT: v_writelane_b32 v39, s54, 15 +; FLATSCR-NEXT: v_writelane_b32 v39, s55, 16 +; FLATSCR-NEXT: v_writelane_b32 v39, s56, 17 +; FLATSCR-NEXT: v_writelane_b32 v39, s57, 18 +; FLATSCR-NEXT: v_writelane_b32 v39, s58, 19 +; FLATSCR-NEXT: v_writelane_b32 v39, s59, 20 +; FLATSCR-NEXT: v_writelane_b32 v39, s60, 21 +; FLATSCR-NEXT: v_writelane_b32 v39, s61, 22 +; FLATSCR-NEXT: v_writelane_b32 v39, s62, 23 +; FLATSCR-NEXT: v_writelane_b32 v39, s63, 24 +; FLATSCR-NEXT: v_writelane_b32 v39, s64, 25 +; FLATSCR-NEXT: v_writelane_b32 v39, s65, 26 +; FLATSCR-NEXT: v_writelane_b32 v39, s66, 27 +; FLATSCR-NEXT: v_writelane_b32 v39, s67, 28 +; FLATSCR-NEXT: v_writelane_b32 v39, s68, 29 +; FLATSCR-NEXT: v_writelane_b32 v39, s69, 30 +; FLATSCR-NEXT: v_writelane_b32 v39, s70, 31 +; FLATSCR-NEXT: v_writelane_b32 v39, s71, 32 +; FLATSCR-NEXT: v_writelane_b32 v39, s72, 33 +; FLATSCR-NEXT: v_writelane_b32 v39, s73, 34 +; FLATSCR-NEXT: v_writelane_b32 v39, s74, 35 +; FLATSCR-NEXT: v_writelane_b32 v39, s75, 36 +; FLATSCR-NEXT: v_writelane_b32 v39, s76, 37 +; FLATSCR-NEXT: v_writelane_b32 v39, s77, 38 +; FLATSCR-NEXT: v_writelane_b32 v39, s78, 39 +; FLATSCR-NEXT: v_writelane_b32 v39, s79, 40 +; FLATSCR-NEXT: v_writelane_b32 v39, s80, 41 +; FLATSCR-NEXT: v_writelane_b32 v39, s81, 42 +; FLATSCR-NEXT: v_writelane_b32 v39, s82, 43 +; FLATSCR-NEXT: v_writelane_b32 v39, s83, 44 +; FLATSCR-NEXT: v_writelane_b32 v39, s84, 45 +; FLATSCR-NEXT: v_writelane_b32 v39, s85, 46 +; FLATSCR-NEXT: v_writelane_b32 v39, s86, 47 +; FLATSCR-NEXT: v_writelane_b32 v39, s87, 48 +; FLATSCR-NEXT: v_writelane_b32 v39, s88, 49 +; FLATSCR-NEXT: v_writelane_b32 v39, s89, 50 +; FLATSCR-NEXT: v_writelane_b32 v39, s90, 51 +; FLATSCR-NEXT: v_writelane_b32 v39, s91, 52 +; FLATSCR-NEXT: v_writelane_b32 v39, s92, 53 +; FLATSCR-NEXT: v_writelane_b32 v39, s93, 54 +; FLATSCR-NEXT: v_writelane_b32 v39, s94, 55 +; FLATSCR-NEXT: v_writelane_b32 v39, s95, 56 +; FLATSCR-NEXT: v_writelane_b32 v39, s96, 57 +; FLATSCR-NEXT: v_writelane_b32 v39, s97, 58 +; FLATSCR-NEXT: v_writelane_b32 v39, s98, 59 +; FLATSCR-NEXT: v_writelane_b32 v39, s99, 60 +; FLATSCR-NEXT: s_addk_i32 s32, 0x100c +; FLATSCR-NEXT: v_writelane_b32 v39, s100, 61 +; FLATSCR-NEXT: v_writelane_b32 v39, s101, 62 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1000 +; FLATSCR-NEXT: v_writelane_b32 v39, s102, 63 +; FLATSCR-NEXT: scratch_store_dword off, v0, s1 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber all VGPRs except CSR v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s102, v39, 63 +; FLATSCR-NEXT: v_readlane_b32 s101, v39, 62 +; FLATSCR-NEXT: v_readlane_b32 s100, v39, 61 +; FLATSCR-NEXT: v_readlane_b32 s99, v39, 60 +; FLATSCR-NEXT: v_readlane_b32 s98, v39, 59 +; FLATSCR-NEXT: v_readlane_b32 s97, v39, 58 +; FLATSCR-NEXT: v_readlane_b32 s96, v39, 57 +; FLATSCR-NEXT: v_readlane_b32 s95, v39, 56 +; FLATSCR-NEXT: v_readlane_b32 s94, v39, 55 +; FLATSCR-NEXT: v_readlane_b32 s93, v39, 54 +; FLATSCR-NEXT: v_readlane_b32 s92, v39, 53 +; FLATSCR-NEXT: v_readlane_b32 s91, v39, 52 +; FLATSCR-NEXT: v_readlane_b32 s90, v39, 51 +; FLATSCR-NEXT: v_readlane_b32 s89, v39, 50 +; FLATSCR-NEXT: v_readlane_b32 s88, v39, 49 +; FLATSCR-NEXT: v_readlane_b32 s87, v39, 48 +; FLATSCR-NEXT: v_readlane_b32 s86, v39, 47 +; FLATSCR-NEXT: v_readlane_b32 s85, v39, 46 +; FLATSCR-NEXT: v_readlane_b32 s84, v39, 45 +; FLATSCR-NEXT: v_readlane_b32 s83, v39, 44 +; FLATSCR-NEXT: v_readlane_b32 s82, v39, 43 +; FLATSCR-NEXT: v_readlane_b32 s81, v39, 42 +; FLATSCR-NEXT: v_readlane_b32 s80, v39, 41 +; FLATSCR-NEXT: v_readlane_b32 s79, v39, 40 +; FLATSCR-NEXT: v_readlane_b32 s78, v39, 39 +; FLATSCR-NEXT: v_readlane_b32 s77, v39, 38 +; FLATSCR-NEXT: v_readlane_b32 s76, v39, 37 +; FLATSCR-NEXT: v_readlane_b32 s75, v39, 36 +; FLATSCR-NEXT: v_readlane_b32 s74, v39, 35 +; FLATSCR-NEXT: v_readlane_b32 s73, v39, 34 +; FLATSCR-NEXT: v_readlane_b32 s72, v39, 33 +; FLATSCR-NEXT: v_readlane_b32 s71, v39, 32 +; FLATSCR-NEXT: v_readlane_b32 s70, v39, 31 +; FLATSCR-NEXT: v_readlane_b32 s69, v39, 30 +; FLATSCR-NEXT: v_readlane_b32 s68, v39, 29 +; FLATSCR-NEXT: v_readlane_b32 s67, v39, 28 +; FLATSCR-NEXT: v_readlane_b32 s66, v39, 27 +; FLATSCR-NEXT: v_readlane_b32 s65, v39, 26 +; FLATSCR-NEXT: v_readlane_b32 s64, v39, 25 +; FLATSCR-NEXT: v_readlane_b32 s63, v39, 24 +; FLATSCR-NEXT: v_readlane_b32 s62, v39, 23 +; FLATSCR-NEXT: v_readlane_b32 s61, v39, 22 +; FLATSCR-NEXT: v_readlane_b32 s60, v39, 21 +; FLATSCR-NEXT: v_readlane_b32 s59, v39, 20 +; FLATSCR-NEXT: v_readlane_b32 s58, v39, 19 +; FLATSCR-NEXT: v_readlane_b32 s57, v39, 18 +; FLATSCR-NEXT: v_readlane_b32 s56, v39, 17 +; FLATSCR-NEXT: v_readlane_b32 s55, v39, 16 +; FLATSCR-NEXT: v_readlane_b32 s54, v39, 15 +; FLATSCR-NEXT: v_readlane_b32 s53, v39, 14 +; FLATSCR-NEXT: v_readlane_b32 s52, v39, 13 +; FLATSCR-NEXT: v_readlane_b32 s51, v39, 12 +; FLATSCR-NEXT: v_readlane_b32 s50, v39, 11 +; FLATSCR-NEXT: v_readlane_b32 s49, v39, 10 +; FLATSCR-NEXT: v_readlane_b32 s48, v39, 9 +; FLATSCR-NEXT: v_readlane_b32 s47, v39, 8 +; FLATSCR-NEXT: v_readlane_b32 s46, v39, 7 +; FLATSCR-NEXT: v_readlane_b32 s45, v39, 6 +; FLATSCR-NEXT: v_readlane_b32 s44, v39, 5 +; FLATSCR-NEXT: v_readlane_b32 s43, v39, 4 +; FLATSCR-NEXT: v_readlane_b32 s42, v39, 3 +; FLATSCR-NEXT: v_readlane_b32 s41, v39, 2 +; FLATSCR-NEXT: v_readlane_b32 s40, v39, 1 +; FLATSCR-NEXT: v_readlane_b32 s39, v39, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: s_add_i32 s1, s33, 0x1004 +; FLATSCR-NEXT: scratch_load_dword v39, off, s1 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 593f40fd1b25e..31e520ce74d98 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s @@ -7,45 +8,76 @@ declare void @external_void_func_i32(i32) #0 -; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm: -; GCN: s_waitcnt - ; Spill CSR VGPR used for SGPR spilling -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 -; GCN-DAG: v_writelane_b32 v40, s30, 0 -; GCN-DAG: v_writelane_b32 v40, s31, 1 - -; GCN: s_swappc_b64 - -; GCN: v_readlane_b32 s31, v40, 1 -; GCN: v_readlane_b32 s30, v40, 0 -; GCN: s_mov_b32 s32, s33 - -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_i32_imm() #0 { +; GCN-LABEL: test_func_call_external_void_func_i32_imm: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_i32(i32 42) ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: -; GCN: s_waitcnt -; GCN: s_mov_b32 s33, s32 -; GCN-DAG: s_addk_i32 s32, 0x1400{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: -; GCN: s_swappc_b64 -; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { +; GCN-LABEL: test_func_call_external_void_func_i32_imm_stack_use: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: s_addk_i32 s32, 0x1400 +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 42 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [16 x i32], align 4, addrspace(5) %gep15 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 16 store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 0676bc79a46f5..d09fc947bac18 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,29 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? -; GCN-LABEL: {{^}}i32_fastcc_i32_i32: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { +; GFX9-LABEL: i32_fastcc_i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %add0 = add i32 %arg0, %arg1 ret i32 %add0 } -; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9 -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20 -; GCN: s_waitcnt vmcnt(0) -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 68 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { +; GFX9-LABEL: i32_fastcc_i32_i32_stack_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 68 %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 store volatile i32 9, ptr addrspace(5) %gep @@ -31,19 +32,34 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { ret i32 %add0 } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32: define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20 -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN: ; ScratchSize: 68 entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -52,12 +68,19 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20 -; GCN: s_setpc_b64 -; GCN: ; ScratchSize: 136 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN: ; ScratchSize: 136 entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -66,45 +89,108 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result: define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void } ; It doesn't make sense to do a tail from a kernel -; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result: -;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_add_i32 s6, s6, s9 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; CIVI-NEXT: s_add_u32 s0, s0, s9 +; CIVI-NEXT: s_addc_u32 s1, s1, 0 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: s_getpc_b64 s[6:7] +; CIVI-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CIVI-NEXT: s_mov_b32 s32, 0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s4 +; CIVI-NEXT: v_mov_b32_e32 v1, s5 +; CIVI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CIVI-NEXT: s_endpgm +; +; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_endpgm entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void } -; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32: -; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) - -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 - -; GCN-NEXT: s_setpc_b64 s[30:31] define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 { +; GFX9-LABEL: i32_fastcc_i32_byval_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %arg1.load = load i32, ptr addrspace(5) %arg1, align 4 %add0 = add i32 %arg0, %arg1.load ret i32 %add0 } ; Tail call disallowed with byval in parent. -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval) ret i32 %ret @@ -113,34 +199,32 @@ entry: ; Tail call disallowed with byval in parent, not callee. The stack ; usage of incoming arguments must be <= the outgoing stack ; arguments. - -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: -; GCN-NOT: v0 -; GCN-NOT: s32 -; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} -; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5))) ret i32 %ret } -; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} - -; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] -; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]] - - -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9: v_add3_u32 v0, v0, v3, v2 - -; GCN-NEXT: s_setpc_b64 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { +; GFX9-LABEL: i32_fastcc_i32_i32_a32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v0, v3, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] %val_firststack = extractvalue [32 x i32] %large, 30 %val_laststack = extractvalue [32 x i32] %large, 31 %add0 = add i32 %arg0, %arg1 @@ -150,31 +234,49 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l } ; FIXME: Why load and store same location for stack args? -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: - -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8 - -; GCN-NOT: s32 - -; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8 - -; GCN-NOT: s32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32 -; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -186,54 +288,114 @@ entry: ; If the callee requires more stack argument space than the caller, ; don't do a tail call. ; TODO: Do we really need this restriction? - -; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space: -; GCN: s_swappc_b64 -; GCN: s_setpc_b64 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { +; GCN-LABEL: no_sibling_call_callee_more_stack_space: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) ret i32 %ret } ; Have another non-tail in the function -; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec -; GCN-DAG: s_addk_i32 s32, 0x400 -; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2 - -; GCN-DAG: s_getpc_b64 s[4:5] -; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 - -; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 - - -; GCN: s_swappc_b64 - -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload - -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 -; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v42, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v42, s30, 0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v42, s31, 1 +; GCN-NEXT: v_mov_b32_e32 v40, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v1, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GCN-NEXT: v_readlane_b32 s31, v42, 1 +; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s6, v42, 2 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call) @@ -242,16 +404,25 @@ entry: ; Have stack object in caller and stack passed arguments. SP should be ; in same place at function exit. - -; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GCN-NOT: s33 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: - -; GCN-NOT: s33 - -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: -; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -260,13 +431,52 @@ entry: ret i32 %ret } -; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 - -; GCN-NOT: s33 -; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -278,11 +488,18 @@ entry: @func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4 ; Do support tail calls with a uniform, but unknown, callee. -; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]] -; GCN: s_setpc_b64 [[FUNC_PTR]] define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b) @@ -291,14 +508,97 @@ entry: ; We can't support a tail call to a divergent target. Use a waterfall ; loop around a regular call -; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32: -; GCN: v_readfirstlane_b32 -; GCN: v_readfirstlane_b32 -; GCN: s_and_saveexec_b64 -; GCN: s_swappc_b64 -; GCN: s_cbranch_execnz -; GCN: s_setpc_b64 define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 { +; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v40, s16, 18 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s35, 3 +; GFX9-NEXT: v_writelane_b32 v40, s36, 4 +; GFX9-NEXT: v_writelane_b32 v40, s37, 5 +; GFX9-NEXT: v_writelane_b32 v40, s38, 6 +; GFX9-NEXT: v_writelane_b32 v40, s39, 7 +; GFX9-NEXT: v_writelane_b32 v40, s40, 8 +; GFX9-NEXT: v_writelane_b32 v40, s41, 9 +; GFX9-NEXT: v_writelane_b32 v40, s42, 10 +; GFX9-NEXT: v_writelane_b32 v40, s43, 11 +; GFX9-NEXT: v_writelane_b32 v40, s44, 12 +; GFX9-NEXT: v_writelane_b32 v40, s45, 13 +; GFX9-NEXT: v_writelane_b32 v40, s46, 14 +; GFX9-NEXT: v_writelane_b32 v40, s47, 15 +; GFX9-NEXT: v_writelane_b32 v40, s48, 16 +; GFX9-NEXT: s_mov_b32 s42, s15 +; GFX9-NEXT: s_mov_b32 s43, s14 +; GFX9-NEXT: s_mov_b32 s44, s13 +; GFX9-NEXT: s_mov_b32 s45, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_mov_b64 s[46:47], exec +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s49, 17 +; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_readfirstlane_b32 s16, v0 +; GFX9-NEXT: v_readfirstlane_b32 s17, v1 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s45 +; GFX9-NEXT: s_mov_b32 s13, s44 +; GFX9-NEXT: s_mov_b32 s14, s43 +; GFX9-NEXT: s_mov_b32 s15, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_xor_b64 exec, exec, s[48:49] +; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: ; %bb.2: +; GFX9-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_readlane_b32 s49, v40, 17 +; GFX9-NEXT: v_readlane_b32 s48, v40, 16 +; GFX9-NEXT: v_readlane_b32 s47, v40, 15 +; GFX9-NEXT: v_readlane_b32 s46, v40, 14 +; GFX9-NEXT: v_readlane_b32 s45, v40, 13 +; GFX9-NEXT: v_readlane_b32 s44, v40, 12 +; GFX9-NEXT: v_readlane_b32 s43, v40, 11 +; GFX9-NEXT: v_readlane_b32 s42, v40, 10 +; GFX9-NEXT: v_readlane_b32 s41, v40, 9 +; GFX9-NEXT: v_readlane_b32 s40, v40, 8 +; GFX9-NEXT: v_readlane_b32 s39, v40, 7 +; GFX9-NEXT: v_readlane_b32 s38, v40, 6 +; GFX9-NEXT: v_readlane_b32 s37, v40, 5 +; GFX9-NEXT: v_readlane_b32 s36, v40, 4 +; GFX9-NEXT: v_readlane_b32 s35, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 18 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %b, %c %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add) @@ -307,30 +607,30 @@ entry: declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64])) -; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval: -; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}} - -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172 -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}} - -; GCN: s_setpc_b64 [[TARGET_ADDR]] define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { +; GCN-LABEL: sibling_call_fastcc_multi_byval: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca0 = alloca [3 x i32], align 16, addrspace(5) %alloca1 = alloca [2 x i64], align 8, addrspace(5) @@ -343,26 +643,55 @@ entry: declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32) ; Callee has a byval and non-byval stack passed argument -; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed: -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16 - -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v30, 0 - -; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]] -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]] define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 { +; GCN-LABEL: sibling_call_byval_and_stack_passed: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca = alloca [3 x i32], align 16, addrspace(5) store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca @@ -372,13 +701,14 @@ entry: declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0) -; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { +; GCN-LABEL: sibling_call_i64_fastcc_i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a) ret i64 %ret @@ -386,13 +716,14 @@ entry: declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) -; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { +; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -400,13 +731,14 @@ entry: declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0) -; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { +; GCN-LABEL: sibling_call_i16_fastcc_i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a) ret i16 %ret @@ -414,13 +746,14 @@ entry: declare hidden fastcc half @f16_fastcc_f16(half %arg0) -; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { +; GCN-LABEL: sibling_call_f16_fastcc_f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc half @f16_fastcc_f16(half %a) ret half %ret @@ -428,13 +761,14 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) -; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { +; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a) ret <3 x i16> %ret @@ -442,13 +776,14 @@ entry: declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0) -; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 { +; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a) ret <4 x i16> %ret @@ -456,13 +791,14 @@ entry: declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0) -; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64: -; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 { +; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a) ret <2 x i64> %ret From 4b16622a2c124927b8c454ddb43f163a937159b8 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Mon, 27 Jan 2025 11:40:29 +0530 Subject: [PATCH 2/4] Split run-lines --- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 1142 +++++++++++++++++++++- 1 file changed, 1136 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index d09fc947bac18..e20248426324f 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,11 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { +; CIVI-LABEL: i32_fastcc_i32_i32: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GCN-LABEL: i32_fastcc_i32_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: i32_fastcc_i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -16,6 +28,24 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { } define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { +; CIVI-LABEL: i32_fastcc_i32_i32_stack_object: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v2, 9 +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GCN-LABEL: i32_fastcc_i32_i32_stack_object: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: i32_fastcc_i32_i32_stack_object: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -24,7 +54,6 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GCN: ; ScratchSize: 68 %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 store volatile i32 9, ptr addrspace(5) %gep @@ -33,6 +62,16 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { } define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -42,12 +81,34 @@ define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v2, 9 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; CIVI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -59,7 +120,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] -; GCN: ; ScratchSize: 68 +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -69,6 +141,18 @@ entry: } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v2, 9 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; CIVI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -80,7 +164,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] -; GCN: ; ScratchSize: 136 +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -90,6 +185,16 @@ entry: } define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -99,6 +204,16 @@ define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void @@ -125,6 +240,25 @@ define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, ; CIVI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; CIVI-NEXT: s_endpgm ; +; GCN-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_endpgm +; ; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 @@ -148,6 +282,22 @@ entry: } define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 { +; CIVI-LABEL: i32_fastcc_i32_byval_i32: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GCN-LABEL: i32_fastcc_i32_byval_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: i32_fastcc_i32_byval_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -162,6 +312,36 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) b ; Tail call disallowed with byval in parent. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 s4, s33 +; CIVI-NEXT: s_mov_b32 s33, s32 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; CIVI-NEXT: v_writelane_b32 v40, s4, 2 +; CIVI-NEXT: s_addk_i32 s32, 0x400 +; CIVI-NEXT: v_writelane_b32 v40, s30, 0 +; CIVI-NEXT: v_writelane_b32 v40, s31, 1 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CIVI-NEXT: v_readlane_b32 s31, v40, 1 +; CIVI-NEXT: v_readlane_b32 s30, v40, 0 +; CIVI-NEXT: s_mov_b32 s32, s33 +; CIVI-NEXT: v_readlane_b32 s4, v40, 2 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: s_mov_b32 s33, s4 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -191,6 +371,36 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval) ret i32 %ret @@ -200,6 +410,17 @@ entry: ; usage of incoming arguments must be <= the outgoing stack ; arguments. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -210,12 +431,47 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %lar ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5))) ret i32 %ret } define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { +; CIVI-LABEL: i32_fastcc_i32_i32_a32i32: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; CIVI-NEXT: s_waitcnt vmcnt(1) +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GCN-LABEL: i32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: i32_fastcc_i32_i32_a32i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -235,6 +491,25 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt vmcnt(2) +; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; CIVI-NEXT: s_waitcnt vmcnt(2) +; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: s_waitcnt vmcnt(2) +; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -253,12 +528,50 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v34, 9 +; CIVI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -277,6 +590,25 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v34, 9 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -289,6 +621,68 @@ entry: ; don't do a tail call. ; TODO: Do we really need this restriction? define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { +; CIVI-LABEL: no_sibling_call_callee_more_stack_space: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 s4, s33 +; CIVI-NEXT: s_mov_b32 s33, s32 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: s_addk_i32 s32, 0x400 +; CIVI-NEXT: v_writelane_b32 v40, s4, 2 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: v_writelane_b32 v40, s30, 0 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: v_mov_b32_e32 v3, 0 +; CIVI-NEXT: v_mov_b32_e32 v4, 0 +; CIVI-NEXT: v_mov_b32_e32 v5, 0 +; CIVI-NEXT: v_mov_b32_e32 v6, 0 +; CIVI-NEXT: v_mov_b32_e32 v7, 0 +; CIVI-NEXT: v_mov_b32_e32 v8, 0 +; CIVI-NEXT: v_mov_b32_e32 v9, 0 +; CIVI-NEXT: v_mov_b32_e32 v10, 0 +; CIVI-NEXT: v_mov_b32_e32 v11, 0 +; CIVI-NEXT: v_mov_b32_e32 v12, 0 +; CIVI-NEXT: v_mov_b32_e32 v13, 0 +; CIVI-NEXT: v_mov_b32_e32 v14, 0 +; CIVI-NEXT: v_mov_b32_e32 v15, 0 +; CIVI-NEXT: v_mov_b32_e32 v16, 0 +; CIVI-NEXT: v_mov_b32_e32 v17, 0 +; CIVI-NEXT: v_mov_b32_e32 v18, 0 +; CIVI-NEXT: v_mov_b32_e32 v19, 0 +; CIVI-NEXT: v_mov_b32_e32 v20, 0 +; CIVI-NEXT: v_mov_b32_e32 v21, 0 +; CIVI-NEXT: v_mov_b32_e32 v22, 0 +; CIVI-NEXT: v_mov_b32_e32 v23, 0 +; CIVI-NEXT: v_mov_b32_e32 v24, 0 +; CIVI-NEXT: v_mov_b32_e32 v25, 0 +; CIVI-NEXT: v_mov_b32_e32 v26, 0 +; CIVI-NEXT: v_mov_b32_e32 v27, 0 +; CIVI-NEXT: v_mov_b32_e32 v28, 0 +; CIVI-NEXT: v_mov_b32_e32 v29, 0 +; CIVI-NEXT: v_mov_b32_e32 v30, 0 +; CIVI-NEXT: v_writelane_b32 v40, s31, 1 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CIVI-NEXT: v_readlane_b32 s31, v40, 1 +; CIVI-NEXT: v_readlane_b32 s30, v40, 0 +; CIVI-NEXT: s_mov_b32 s32, s33 +; CIVI-NEXT: v_readlane_b32 s4, v40, 2 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: s_mov_b32 s33, s4 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; ; GCN-LABEL: no_sibling_call_callee_more_stack_space: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -350,6 +744,68 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: no_sibling_call_callee_more_stack_space: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0 +; GFX9-NEXT: v_mov_b32_e32 v23, 0 +; GFX9-NEXT: v_mov_b32_e32 v24, 0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-NEXT: v_mov_b32_e32 v27, 0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v29, 0 +; GFX9-NEXT: v_mov_b32_e32 v30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) ret i32 %ret @@ -357,6 +813,46 @@ entry: ; Have another non-tail in the function define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 s4, s33 +; CIVI-NEXT: s_mov_b32 s33, s32 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: s_addk_i32 s32, 0x400 +; CIVI-NEXT: v_writelane_b32 v42, s4, 2 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_writelane_b32 v42, s30, 0 +; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CIVI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CIVI-NEXT: v_writelane_b32 v42, s31, 1 +; CIVI-NEXT: v_mov_b32_e32 v40, v1 +; CIVI-NEXT: v_mov_b32_e32 v41, v0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CIVI-NEXT: v_mov_b32_e32 v2, v0 +; CIVI-NEXT: v_mov_b32_e32 v0, v41 +; CIVI-NEXT: v_mov_b32_e32 v1, v40 +; CIVI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; CIVI-NEXT: v_readlane_b32 s31, v42, 1 +; CIVI-NEXT: v_readlane_b32 s30, v42, 0 +; CIVI-NEXT: s_mov_b32 s32, s33 +; CIVI-NEXT: v_readlane_b32 s6, v42, 2 +; CIVI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CIVI-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CIVI-NEXT: s_mov_b64 exec, s[8:9] +; CIVI-NEXT: s_mov_b32 s33, s6 +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -396,6 +892,46 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v42, s4, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mov_b32_e32 v1, v40 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s6, v42, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call) @@ -405,6 +941,25 @@ entry: ; Have stack object in caller and stack passed arguments. SP should be ; in same place at function exit. define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v34, 9 +; CIVI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -423,6 +978,25 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i3 ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v34, 9 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -432,6 +1006,52 @@ entry: } define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { +; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[4:5] +; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CIVI-NEXT: v_mov_b32_e32 v2, 9 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: v_mov_b32_e32 v3, 0 +; CIVI-NEXT: v_mov_b32_e32 v4, 0 +; CIVI-NEXT: v_mov_b32_e32 v5, 0 +; CIVI-NEXT: v_mov_b32_e32 v6, 0 +; CIVI-NEXT: v_mov_b32_e32 v7, 0 +; CIVI-NEXT: v_mov_b32_e32 v8, 0 +; CIVI-NEXT: v_mov_b32_e32 v9, 0 +; CIVI-NEXT: v_mov_b32_e32 v10, 0 +; CIVI-NEXT: v_mov_b32_e32 v11, 0 +; CIVI-NEXT: v_mov_b32_e32 v12, 0 +; CIVI-NEXT: v_mov_b32_e32 v13, 0 +; CIVI-NEXT: v_mov_b32_e32 v14, 0 +; CIVI-NEXT: v_mov_b32_e32 v15, 0 +; CIVI-NEXT: v_mov_b32_e32 v16, 0 +; CIVI-NEXT: v_mov_b32_e32 v17, 0 +; CIVI-NEXT: v_mov_b32_e32 v18, 0 +; CIVI-NEXT: v_mov_b32_e32 v19, 0 +; CIVI-NEXT: v_mov_b32_e32 v20, 0 +; CIVI-NEXT: v_mov_b32_e32 v21, 0 +; CIVI-NEXT: v_mov_b32_e32 v22, 0 +; CIVI-NEXT: v_mov_b32_e32 v23, 0 +; CIVI-NEXT: v_mov_b32_e32 v24, 0 +; CIVI-NEXT: v_mov_b32_e32 v25, 0 +; CIVI-NEXT: v_mov_b32_e32 v26, 0 +; CIVI-NEXT: v_mov_b32_e32 v27, 0 +; CIVI-NEXT: v_mov_b32_e32 v28, 0 +; CIVI-NEXT: v_mov_b32_e32 v29, 0 +; CIVI-NEXT: v_mov_b32_e32 v30, 0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[4:5] +; ; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -477,6 +1097,52 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg ; GCN-NEXT: v_mov_b32_e32 v30, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0 +; GFX9-NEXT: v_mov_b32_e32 v23, 0 +; GFX9-NEXT: v_mov_b32_e32 v24, 0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-NEXT: v_mov_b32_e32 v27, 0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v29, 0 +; GFX9-NEXT: v_mov_b32_e32 v30, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -489,6 +1155,18 @@ entry: ; Do support tail calls with a uniform, but unknown, callee. define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; CIVI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -500,6 +1178,18 @@ define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 % ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b) @@ -509,6 +1199,188 @@ entry: ; We can't support a tail call to a divergent target. Use a waterfall ; loop around a regular call define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 { +; CIVI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 s16, s33 +; CIVI-NEXT: s_mov_b32 s33, s32 +; CIVI-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CIVI-NEXT: s_mov_b64 exec, s[18:19] +; CIVI-NEXT: v_writelane_b32 v40, s16, 18 +; CIVI-NEXT: v_writelane_b32 v40, s30, 0 +; CIVI-NEXT: v_writelane_b32 v40, s31, 1 +; CIVI-NEXT: v_writelane_b32 v40, s34, 2 +; CIVI-NEXT: v_writelane_b32 v40, s35, 3 +; CIVI-NEXT: v_writelane_b32 v40, s36, 4 +; CIVI-NEXT: v_writelane_b32 v40, s37, 5 +; CIVI-NEXT: v_writelane_b32 v40, s38, 6 +; CIVI-NEXT: v_writelane_b32 v40, s39, 7 +; CIVI-NEXT: v_writelane_b32 v40, s40, 8 +; CIVI-NEXT: v_writelane_b32 v40, s41, 9 +; CIVI-NEXT: v_writelane_b32 v40, s42, 10 +; CIVI-NEXT: v_writelane_b32 v40, s43, 11 +; CIVI-NEXT: v_writelane_b32 v40, s44, 12 +; CIVI-NEXT: v_writelane_b32 v40, s45, 13 +; CIVI-NEXT: v_writelane_b32 v40, s46, 14 +; CIVI-NEXT: v_writelane_b32 v40, s47, 15 +; CIVI-NEXT: v_writelane_b32 v40, s48, 16 +; CIVI-NEXT: s_mov_b32 s42, s15 +; CIVI-NEXT: s_mov_b32 s43, s14 +; CIVI-NEXT: s_mov_b32 s44, s13 +; CIVI-NEXT: s_mov_b32 s45, s12 +; CIVI-NEXT: s_mov_b64 s[34:35], s[10:11] +; CIVI-NEXT: s_mov_b64 s[36:37], s[8:9] +; CIVI-NEXT: s_mov_b64 s[38:39], s[6:7] +; CIVI-NEXT: s_mov_b64 s[40:41], s[4:5] +; CIVI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; CIVI-NEXT: s_mov_b64 s[46:47], exec +; CIVI-NEXT: s_addk_i32 s32, 0x400 +; CIVI-NEXT: v_writelane_b32 v40, s49, 17 +; CIVI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; CIVI-NEXT: v_readfirstlane_b32 s16, v0 +; CIVI-NEXT: v_readfirstlane_b32 s17, v1 +; CIVI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; CIVI-NEXT: s_and_saveexec_b64 s[48:49], vcc +; CIVI-NEXT: s_mov_b64 s[4:5], s[40:41] +; CIVI-NEXT: s_mov_b64 s[6:7], s[38:39] +; CIVI-NEXT: s_mov_b64 s[8:9], s[36:37] +; CIVI-NEXT: s_mov_b64 s[10:11], s[34:35] +; CIVI-NEXT: s_mov_b32 s12, s45 +; CIVI-NEXT: s_mov_b32 s13, s44 +; CIVI-NEXT: s_mov_b32 s14, s43 +; CIVI-NEXT: s_mov_b32 s15, s42 +; CIVI-NEXT: v_mov_b32_e32 v0, v2 +; CIVI-NEXT: v_mov_b32_e32 v1, v3 +; CIVI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CIVI-NEXT: v_mov_b32_e32 v4, v0 +; CIVI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CIVI-NEXT: ; implicit-def: $vgpr31 +; CIVI-NEXT: ; implicit-def: $vgpr2 +; CIVI-NEXT: ; implicit-def: $vgpr3 +; CIVI-NEXT: s_xor_b64 exec, exec, s[48:49] +; CIVI-NEXT: s_cbranch_execnz .LBB18_1 +; CIVI-NEXT: ; %bb.2: +; CIVI-NEXT: s_mov_b64 exec, s[46:47] +; CIVI-NEXT: v_mov_b32_e32 v0, v4 +; CIVI-NEXT: v_readlane_b32 s49, v40, 17 +; CIVI-NEXT: v_readlane_b32 s48, v40, 16 +; CIVI-NEXT: v_readlane_b32 s47, v40, 15 +; CIVI-NEXT: v_readlane_b32 s46, v40, 14 +; CIVI-NEXT: v_readlane_b32 s45, v40, 13 +; CIVI-NEXT: v_readlane_b32 s44, v40, 12 +; CIVI-NEXT: v_readlane_b32 s43, v40, 11 +; CIVI-NEXT: v_readlane_b32 s42, v40, 10 +; CIVI-NEXT: v_readlane_b32 s41, v40, 9 +; CIVI-NEXT: v_readlane_b32 s40, v40, 8 +; CIVI-NEXT: v_readlane_b32 s39, v40, 7 +; CIVI-NEXT: v_readlane_b32 s38, v40, 6 +; CIVI-NEXT: v_readlane_b32 s37, v40, 5 +; CIVI-NEXT: v_readlane_b32 s36, v40, 4 +; CIVI-NEXT: v_readlane_b32 s35, v40, 3 +; CIVI-NEXT: v_readlane_b32 s34, v40, 2 +; CIVI-NEXT: v_readlane_b32 s31, v40, 1 +; CIVI-NEXT: v_readlane_b32 s30, v40, 0 +; CIVI-NEXT: s_mov_b32 s32, s33 +; CIVI-NEXT: v_readlane_b32 s4, v40, 18 +; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CIVI-NEXT: s_mov_b64 exec, s[6:7] +; CIVI-NEXT: s_mov_b32 s33, s4 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] +; +; GCN-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s36, 4 +; GCN-NEXT: v_writelane_b32 v40, s37, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 6 +; GCN-NEXT: v_writelane_b32 v40, s39, 7 +; GCN-NEXT: v_writelane_b32 v40, s40, 8 +; GCN-NEXT: v_writelane_b32 v40, s41, 9 +; GCN-NEXT: v_writelane_b32 v40, s42, 10 +; GCN-NEXT: v_writelane_b32 v40, s43, 11 +; GCN-NEXT: v_writelane_b32 v40, s44, 12 +; GCN-NEXT: v_writelane_b32 v40, s45, 13 +; GCN-NEXT: v_writelane_b32 v40, s46, 14 +; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: v_writelane_b32 v40, s48, 16 +; GCN-NEXT: s_mov_b32 s42, s15 +; GCN-NEXT: s_mov_b32 s43, s14 +; GCN-NEXT: s_mov_b32 s44, s13 +; GCN-NEXT: s_mov_b32 s45, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: s_mov_b64 s[46:47], exec +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s49, 17 +; GCN-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s45 +; GCN-NEXT: s_mov_b32 s13, s44 +; GCN-NEXT: s_mov_b32 s14, s43 +; GCN-NEXT: s_mov_b32 s15, s42 +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] +; GCN-NEXT: s_cbranch_execnz .LBB18_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[46:47] +; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_readlane_b32 s49, v40, 17 +; GCN-NEXT: v_readlane_b32 s48, v40, 16 +; GCN-NEXT: v_readlane_b32 s47, v40, 15 +; GCN-NEXT: v_readlane_b32 s46, v40, 14 +; GCN-NEXT: v_readlane_b32 s45, v40, 13 +; GCN-NEXT: v_readlane_b32 s44, v40, 12 +; GCN-NEXT: v_readlane_b32 s43, v40, 11 +; GCN-NEXT: v_readlane_b32 s42, v40, 10 +; GCN-NEXT: v_readlane_b32 s41, v40, 9 +; GCN-NEXT: v_readlane_b32 s40, v40, 8 +; GCN-NEXT: v_readlane_b32 s39, v40, 7 +; GCN-NEXT: v_readlane_b32 s38, v40, 6 +; GCN-NEXT: v_readlane_b32 s37, v40, 5 +; GCN-NEXT: v_readlane_b32 s36, v40, 4 +; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -608,6 +1480,30 @@ entry: declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64])) define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { +; CIVI-LABEL: sibling_call_fastcc_multi_byval: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v1, 9 +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_fastcc_multi_byval: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -631,6 +1527,30 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_fastcc_multi_byval: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %alloca0 = alloca [3 x i32], align 16, addrspace(5) %alloca1 = alloca [2 x i64], align 8, addrspace(5) @@ -644,6 +1564,55 @@ declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([ ; Callee has a byval and non-byval stack passed argument define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 { +; CIVI-LABEL: sibling_call_byval_and_stack_passed: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v1, 9 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; CIVI-NEXT: v_mov_b32_e32 v1, 0 +; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; CIVI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CIVI-NEXT: v_mov_b32_e32 v0, 0 +; CIVI-NEXT: v_mov_b32_e32 v1, 0 +; CIVI-NEXT: v_mov_b32_e32 v2, 0 +; CIVI-NEXT: v_mov_b32_e32 v3, 0 +; CIVI-NEXT: v_mov_b32_e32 v4, 0 +; CIVI-NEXT: v_mov_b32_e32 v5, 0 +; CIVI-NEXT: v_mov_b32_e32 v6, 0 +; CIVI-NEXT: v_mov_b32_e32 v7, 0 +; CIVI-NEXT: v_mov_b32_e32 v8, 0 +; CIVI-NEXT: v_mov_b32_e32 v9, 0 +; CIVI-NEXT: v_mov_b32_e32 v10, 0 +; CIVI-NEXT: v_mov_b32_e32 v11, 0 +; CIVI-NEXT: v_mov_b32_e32 v12, 0 +; CIVI-NEXT: v_mov_b32_e32 v13, 0 +; CIVI-NEXT: v_mov_b32_e32 v14, 0 +; CIVI-NEXT: v_mov_b32_e32 v15, 0 +; CIVI-NEXT: v_mov_b32_e32 v16, 0 +; CIVI-NEXT: v_mov_b32_e32 v17, 0 +; CIVI-NEXT: v_mov_b32_e32 v18, 0 +; CIVI-NEXT: v_mov_b32_e32 v19, 0 +; CIVI-NEXT: v_mov_b32_e32 v20, 0 +; CIVI-NEXT: v_mov_b32_e32 v21, 0 +; CIVI-NEXT: v_mov_b32_e32 v22, 0 +; CIVI-NEXT: v_mov_b32_e32 v23, 0 +; CIVI-NEXT: v_mov_b32_e32 v24, 0 +; CIVI-NEXT: v_mov_b32_e32 v25, 0 +; CIVI-NEXT: v_mov_b32_e32 v26, 0 +; CIVI-NEXT: v_mov_b32_e32 v27, 0 +; CIVI-NEXT: v_mov_b32_e32 v28, 0 +; CIVI-NEXT: v_mov_b32_e32 v29, 0 +; CIVI-NEXT: v_mov_b32_e32 v30, 0 +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_byval_and_stack_passed: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -692,6 +1661,55 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_byval_and_stack_passed: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0 +; GFX9-NEXT: v_mov_b32_e32 v23, 0 +; GFX9-NEXT: v_mov_b32_e32 v24, 0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-NEXT: v_mov_b32_e32 v27, 0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v29, 0 +; GFX9-NEXT: v_mov_b32_e32 v30, 0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %alloca = alloca [3 x i32], align 16, addrspace(5) store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca @@ -702,6 +1720,14 @@ entry: declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0) define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { +; CIVI-LABEL: sibling_call_i64_fastcc_i64: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_i64_fastcc_i64: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -709,6 +1735,14 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { ; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_i64_fastcc_i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a) ret i64 %ret @@ -717,6 +1751,14 @@ entry: declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { +; CIVI-LABEL: sibling_call_p1i8_fastcc_p1i8: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -724,6 +1766,14 @@ define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspa ; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_p1i8_fastcc_p1i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -732,6 +1782,14 @@ entry: declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0) define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { +; CIVI-LABEL: sibling_call_i16_fastcc_i16: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_i16_fastcc_i16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -739,6 +1797,14 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { ; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_i16_fastcc_i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a) ret i16 %ret @@ -747,6 +1813,14 @@ entry: declare hidden fastcc half @f16_fastcc_f16(half %arg0) define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { +; CIVI-LABEL: sibling_call_f16_fastcc_f16: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_f16_fastcc_f16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -754,6 +1828,14 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { ; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_f16_fastcc_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc half @f16_fastcc_f16(half %a) ret half %ret @@ -762,6 +1844,14 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { +; CIVI-LABEL: sibling_call_v3i16_fastcc_v3i16: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -769,6 +1859,14 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a) ret <3 x i16> %ret @@ -777,6 +1875,14 @@ entry: declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0) define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 { +; CIVI-LABEL: sibling_call_v4i16_fastcc_v4i16: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -784,6 +1890,14 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 ; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_v4i16_fastcc_v4i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a) ret <4 x i16> %ret @@ -792,6 +1906,14 @@ entry: declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0) define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 { +; CIVI-LABEL: sibling_call_v2i64_fastcc_v2i64: +; CIVI: ; %bb.0: ; %entry +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_getpc_b64 s[16:17] +; CIVI-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; CIVI-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; CIVI-NEXT: s_setpc_b64 s[16:17] +; ; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -799,6 +1921,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] +; +; GFX9-LABEL: sibling_call_v2i64_fastcc_v2i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; GFX9-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a) ret <2 x i64> %ret From d999f57215535746e23641c1aff87420ad69c113 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Mon, 27 Jan 2025 11:47:32 +0530 Subject: [PATCH 3/4] Update run lines --- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 2196 +++++++++++----------- 1 file changed, 1098 insertions(+), 1098 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index e20248426324f..c79d60c2a951f 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,22 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FIJI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HAWAII %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { -; CIVI-LABEL: i32_fastcc_i32_i32: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: i32_fastcc_i32_i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: i32_fastcc_i32_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: i32_fastcc_i32_i32: ; GFX9: ; %bb.0: @@ -28,23 +28,23 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { } define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { -; CIVI-LABEL: i32_fastcc_i32_i32_stack_object: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v2, 9 -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: i32_fastcc_i32_i32_stack_object: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, 9 +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: i32_fastcc_i32_i32_stack_object: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 9 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: i32_fastcc_i32_i32_stack_object: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, 9 +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: i32_fastcc_i32_i32_stack_object: ; GFX9: ; %bb.0: @@ -62,25 +62,25 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { } define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32: ; GFX9: ; %bb.0: ; %entry @@ -97,29 +97,29 @@ entry: } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v2, 9 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; CIVI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v2, 9 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v2, 9 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; HAWAII-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: ; GFX9: ; %bb.0: ; %entry @@ -141,29 +141,29 @@ entry: } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v2, 9 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; CIVI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v2, 9 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v2, 9 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; HAWAII-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: ; GFX9: ; %bb.0: ; %entry @@ -185,25 +185,25 @@ entry: } define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: ; GFX9: ; %bb.0: ; %entry @@ -221,43 +221,43 @@ entry: ; It doesn't make sense to do a tail from a kernel define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_add_i32 s6, s6, s9 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; CIVI-NEXT: s_add_u32 s0, s0, s9 -; CIVI-NEXT: s_addc_u32 s1, s1, 0 -; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s7 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: s_getpc_b64 s[6:7] -; CIVI-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CIVI-NEXT: s_mov_b32 s32, 0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v0, s4 -; CIVI-NEXT: v_mov_b32_e32 v1, s5 -; CIVI-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CIVI-NEXT: s_endpgm +; FIJI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_add_i32 s6, s6, s9 +; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIJI-NEXT: s_add_u32 s0, s0, s9 +; FIJI-NEXT: s_addc_u32 s1, s1, 0 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: s_getpc_b64 s[6:7] +; FIJI-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; FIJI-NEXT: s_mov_b32 s32, 0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s4 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: s_swappc_b64 s[30:31], s[6:7] +; FIJI-NEXT: s_endpgm ; -; GCN-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_i32 s6, s6, s9 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_endpgm +; HAWAII-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_add_i32 s6, s6, s9 +; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HAWAII-NEXT: s_add_u32 s0, s0, s9 +; HAWAII-NEXT: s_addc_u32 s1, s1, 0 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: s_getpc_b64 s[6:7] +; HAWAII-NEXT: s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; HAWAII-NEXT: s_mov_b32 s32, 0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[6:7] +; HAWAII-NEXT: s_endpgm ; ; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result: ; GFX9: ; %bb.0: ; %entry @@ -282,21 +282,21 @@ entry: } define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 { -; CIVI-LABEL: i32_fastcc_i32_byval_i32: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: i32_fastcc_i32_byval_i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: i32_fastcc_i32_byval_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: i32_fastcc_i32_byval_i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: i32_fastcc_i32_byval_i32: ; GFX9: ; %bb.0: @@ -312,65 +312,65 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) b ; Tail call disallowed with byval in parent. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 s4, s33 -; CIVI-NEXT: s_mov_b32 s33, s32 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], s33 -; CIVI-NEXT: v_writelane_b32 v40, s4, 2 -; CIVI-NEXT: s_addk_i32 s32, 0x400 -; CIVI-NEXT: v_writelane_b32 v40, s30, 0 -; CIVI-NEXT: v_writelane_b32 v40, s31, 1 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CIVI-NEXT: v_readlane_b32 s31, v40, 1 -; CIVI-NEXT: v_readlane_b32 s30, v40, 0 -; CIVI-NEXT: s_mov_b32 s32, s33 -; CIVI-NEXT: v_readlane_b32 s4, v40, 2 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: s_mov_b32 s33, s4 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_mov_b32 s4, s33 +; FIJI-NEXT: s_mov_b32 s33, s32 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; FIJI-NEXT: v_writelane_b32 v40, s4, 2 +; FIJI-NEXT: s_addk_i32 s32, 0x400 +; FIJI-NEXT: v_writelane_b32 v40, s30, 0 +; FIJI-NEXT: v_writelane_b32 v40, s31, 1 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FIJI-NEXT: v_readlane_b32 s31, v40, 1 +; FIJI-NEXT: v_readlane_b32 s30, v40, 0 +; FIJI-NEXT: s_mov_b32 s32, s33 +; FIJI-NEXT: v_readlane_b32 s4, v40, 2 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_mov_b32 s33, s4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 -; GCN-NEXT: v_writelane_b32 v40, s4, 2 -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_mov_b32 s4, s33 +; HAWAII-NEXT: s_mov_b32 s33, s32 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; HAWAII-NEXT: v_writelane_b32 v40, s4, 2 +; HAWAII-NEXT: s_addk_i32 s32, 0x400 +; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 +; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 +; HAWAII-NEXT: s_mov_b32 s32, s33 +; HAWAII-NEXT: v_readlane_b32 s4, v40, 2 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_mov_b32 s33, s4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: ; GFX9: ; %bb.0: ; %entry @@ -410,27 +410,27 @@ entry: ; usage of incoming arguments must be <= the outgoing stack ; arguments. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32: ; GFX9: ; %bb.0: ; %entry @@ -448,29 +448,29 @@ entry: } define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { -; CIVI-LABEL: i32_fastcc_i32_i32_a32i32: -; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; CIVI-NEXT: s_waitcnt vmcnt(1) -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: i32_fastcc_i32_i32_a32i32: +; FIJI: ; %bb.0: +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FIJI-NEXT: s_waitcnt vmcnt(1) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: i32_fastcc_i32_i32_a32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: i32_fastcc_i32_i32_a32i32: +; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; HAWAII-NEXT: s_waitcnt vmcnt(1) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: i32_fastcc_i32_i32_a32i32: ; GFX9: ; %bb.0: @@ -491,43 +491,43 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: s_waitcnt vmcnt(2) -; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; CIVI-NEXT: s_waitcnt vmcnt(2) -; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: s_waitcnt vmcnt(2) -; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: s_waitcnt vmcnt(2) +; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; FIJI-NEXT: s_waitcnt vmcnt(2) +; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: s_waitcnt vmcnt(2) +; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: s_waitcnt vmcnt(2) +; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: s_waitcnt vmcnt(2) +; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: s_waitcnt vmcnt(2) +; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: ; GFX9: ; %bb.0: ; %entry @@ -553,43 +553,43 @@ entry: } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v34, 9 -; CIVI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v34, 9 +; FIJI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v34, 9 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v34, 9 +; HAWAII-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GFX9: ; %bb.0: ; %entry @@ -621,129 +621,129 @@ entry: ; don't do a tail call. ; TODO: Do we really need this restriction? define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { -; CIVI-LABEL: no_sibling_call_callee_more_stack_space: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 s4, s33 -; CIVI-NEXT: s_mov_b32 s33, s32 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: s_addk_i32 s32, 0x400 -; CIVI-NEXT: v_writelane_b32 v40, s4, 2 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: v_writelane_b32 v40, s30, 0 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: v_mov_b32_e32 v3, 0 -; CIVI-NEXT: v_mov_b32_e32 v4, 0 -; CIVI-NEXT: v_mov_b32_e32 v5, 0 -; CIVI-NEXT: v_mov_b32_e32 v6, 0 -; CIVI-NEXT: v_mov_b32_e32 v7, 0 -; CIVI-NEXT: v_mov_b32_e32 v8, 0 -; CIVI-NEXT: v_mov_b32_e32 v9, 0 -; CIVI-NEXT: v_mov_b32_e32 v10, 0 -; CIVI-NEXT: v_mov_b32_e32 v11, 0 -; CIVI-NEXT: v_mov_b32_e32 v12, 0 -; CIVI-NEXT: v_mov_b32_e32 v13, 0 -; CIVI-NEXT: v_mov_b32_e32 v14, 0 -; CIVI-NEXT: v_mov_b32_e32 v15, 0 -; CIVI-NEXT: v_mov_b32_e32 v16, 0 -; CIVI-NEXT: v_mov_b32_e32 v17, 0 -; CIVI-NEXT: v_mov_b32_e32 v18, 0 -; CIVI-NEXT: v_mov_b32_e32 v19, 0 -; CIVI-NEXT: v_mov_b32_e32 v20, 0 -; CIVI-NEXT: v_mov_b32_e32 v21, 0 -; CIVI-NEXT: v_mov_b32_e32 v22, 0 -; CIVI-NEXT: v_mov_b32_e32 v23, 0 -; CIVI-NEXT: v_mov_b32_e32 v24, 0 -; CIVI-NEXT: v_mov_b32_e32 v25, 0 -; CIVI-NEXT: v_mov_b32_e32 v26, 0 -; CIVI-NEXT: v_mov_b32_e32 v27, 0 -; CIVI-NEXT: v_mov_b32_e32 v28, 0 -; CIVI-NEXT: v_mov_b32_e32 v29, 0 -; CIVI-NEXT: v_mov_b32_e32 v30, 0 -; CIVI-NEXT: v_writelane_b32 v40, s31, 1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CIVI-NEXT: v_readlane_b32 s31, v40, 1 -; CIVI-NEXT: v_readlane_b32 s30, v40, 0 -; CIVI-NEXT: s_mov_b32 s32, s33 -; CIVI-NEXT: v_readlane_b32 s4, v40, 2 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: s_mov_b32 s33, s4 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: no_sibling_call_callee_more_stack_space: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_mov_b32 s4, s33 +; FIJI-NEXT: s_mov_b32 s33, s32 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_addk_i32 s32, 0x400 +; FIJI-NEXT: v_writelane_b32 v40, s4, 2 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: v_writelane_b32 v40, s30, 0 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: v_mov_b32_e32 v3, 0 +; FIJI-NEXT: v_mov_b32_e32 v4, 0 +; FIJI-NEXT: v_mov_b32_e32 v5, 0 +; FIJI-NEXT: v_mov_b32_e32 v6, 0 +; FIJI-NEXT: v_mov_b32_e32 v7, 0 +; FIJI-NEXT: v_mov_b32_e32 v8, 0 +; FIJI-NEXT: v_mov_b32_e32 v9, 0 +; FIJI-NEXT: v_mov_b32_e32 v10, 0 +; FIJI-NEXT: v_mov_b32_e32 v11, 0 +; FIJI-NEXT: v_mov_b32_e32 v12, 0 +; FIJI-NEXT: v_mov_b32_e32 v13, 0 +; FIJI-NEXT: v_mov_b32_e32 v14, 0 +; FIJI-NEXT: v_mov_b32_e32 v15, 0 +; FIJI-NEXT: v_mov_b32_e32 v16, 0 +; FIJI-NEXT: v_mov_b32_e32 v17, 0 +; FIJI-NEXT: v_mov_b32_e32 v18, 0 +; FIJI-NEXT: v_mov_b32_e32 v19, 0 +; FIJI-NEXT: v_mov_b32_e32 v20, 0 +; FIJI-NEXT: v_mov_b32_e32 v21, 0 +; FIJI-NEXT: v_mov_b32_e32 v22, 0 +; FIJI-NEXT: v_mov_b32_e32 v23, 0 +; FIJI-NEXT: v_mov_b32_e32 v24, 0 +; FIJI-NEXT: v_mov_b32_e32 v25, 0 +; FIJI-NEXT: v_mov_b32_e32 v26, 0 +; FIJI-NEXT: v_mov_b32_e32 v27, 0 +; FIJI-NEXT: v_mov_b32_e32 v28, 0 +; FIJI-NEXT: v_mov_b32_e32 v29, 0 +; FIJI-NEXT: v_mov_b32_e32 v30, 0 +; FIJI-NEXT: v_writelane_b32 v40, s31, 1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FIJI-NEXT: v_readlane_b32 s31, v40, 1 +; FIJI-NEXT: v_readlane_b32 s30, v40, 0 +; FIJI-NEXT: s_mov_b32 s32, s33 +; FIJI-NEXT: v_readlane_b32 s4, v40, 2 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_mov_b32 s33, s4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: no_sibling_call_callee_more_stack_space: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s4, 2 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: no_sibling_call_callee_more_stack_space: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_mov_b32 s4, s33 +; HAWAII-NEXT: s_mov_b32 s33, s32 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_addk_i32 s32, 0x400 +; HAWAII-NEXT: v_writelane_b32 v40, s4, 2 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: v_mov_b32_e32 v3, 0 +; HAWAII-NEXT: v_mov_b32_e32 v4, 0 +; HAWAII-NEXT: v_mov_b32_e32 v5, 0 +; HAWAII-NEXT: v_mov_b32_e32 v6, 0 +; HAWAII-NEXT: v_mov_b32_e32 v7, 0 +; HAWAII-NEXT: v_mov_b32_e32 v8, 0 +; HAWAII-NEXT: v_mov_b32_e32 v9, 0 +; HAWAII-NEXT: v_mov_b32_e32 v10, 0 +; HAWAII-NEXT: v_mov_b32_e32 v11, 0 +; HAWAII-NEXT: v_mov_b32_e32 v12, 0 +; HAWAII-NEXT: v_mov_b32_e32 v13, 0 +; HAWAII-NEXT: v_mov_b32_e32 v14, 0 +; HAWAII-NEXT: v_mov_b32_e32 v15, 0 +; HAWAII-NEXT: v_mov_b32_e32 v16, 0 +; HAWAII-NEXT: v_mov_b32_e32 v17, 0 +; HAWAII-NEXT: v_mov_b32_e32 v18, 0 +; HAWAII-NEXT: v_mov_b32_e32 v19, 0 +; HAWAII-NEXT: v_mov_b32_e32 v20, 0 +; HAWAII-NEXT: v_mov_b32_e32 v21, 0 +; HAWAII-NEXT: v_mov_b32_e32 v22, 0 +; HAWAII-NEXT: v_mov_b32_e32 v23, 0 +; HAWAII-NEXT: v_mov_b32_e32 v24, 0 +; HAWAII-NEXT: v_mov_b32_e32 v25, 0 +; HAWAII-NEXT: v_mov_b32_e32 v26, 0 +; HAWAII-NEXT: v_mov_b32_e32 v27, 0 +; HAWAII-NEXT: v_mov_b32_e32 v28, 0 +; HAWAII-NEXT: v_mov_b32_e32 v29, 0 +; HAWAII-NEXT: v_mov_b32_e32 v30, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 +; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 +; HAWAII-NEXT: s_mov_b32 s32, s33 +; HAWAII-NEXT: v_readlane_b32 s4, v40, 2 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_mov_b32 s33, s4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: no_sibling_call_callee_more_stack_space: ; GFX9: ; %bb.0: ; %entry @@ -813,85 +813,85 @@ entry: ; Have another non-tail in the function define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 s4, s33 -; CIVI-NEXT: s_mov_b32 s33, s32 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: s_addk_i32 s32, 0x400 -; CIVI-NEXT: v_writelane_b32 v42, s4, 2 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_writelane_b32 v42, s30, 0 -; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CIVI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CIVI-NEXT: v_writelane_b32 v42, s31, 1 -; CIVI-NEXT: v_mov_b32_e32 v40, v1 -; CIVI-NEXT: v_mov_b32_e32 v41, v0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CIVI-NEXT: v_mov_b32_e32 v2, v0 -; CIVI-NEXT: v_mov_b32_e32 v0, v41 -; CIVI-NEXT: v_mov_b32_e32 v1, v40 -; CIVI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; CIVI-NEXT: v_readlane_b32 s31, v42, 1 -; CIVI-NEXT: v_readlane_b32 s30, v42, 0 -; CIVI-NEXT: s_mov_b32 s32, s33 -; CIVI-NEXT: v_readlane_b32 s6, v42, 2 -; CIVI-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CIVI-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CIVI-NEXT: s_mov_b64 exec, s[8:9] -; CIVI-NEXT: s_mov_b32 s33, s6 -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_mov_b32 s4, s33 +; FIJI-NEXT: s_mov_b32 s33, s32 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_addk_i32 s32, 0x400 +; FIJI-NEXT: v_writelane_b32 v42, s4, 2 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_writelane_b32 v42, s30, 0 +; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; FIJI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FIJI-NEXT: v_writelane_b32 v42, s31, 1 +; FIJI-NEXT: v_mov_b32_e32 v40, v1 +; FIJI-NEXT: v_mov_b32_e32 v41, v0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FIJI-NEXT: v_mov_b32_e32 v2, v0 +; FIJI-NEXT: v_mov_b32_e32 v0, v41 +; FIJI-NEXT: v_mov_b32_e32 v1, v40 +; FIJI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; FIJI-NEXT: v_readlane_b32 s31, v42, 1 +; FIJI-NEXT: v_readlane_b32 s30, v42, 0 +; FIJI-NEXT: s_mov_b32 s32, s33 +; FIJI-NEXT: v_readlane_b32 s6, v42, 2 +; FIJI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; FIJI-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; FIJI-NEXT: s_mov_b64 exec, s[8:9] +; FIJI-NEXT: s_mov_b32 s33, s6 +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v42, s4, 2 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_writelane_b32 v42, s30, 0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s31, 1 -; GCN-NEXT: v_mov_b32_e32 v40, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_mov_b32_e32 v1, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GCN-NEXT: v_readlane_b32 s31, v42, 1 -; GCN-NEXT: v_readlane_b32 s30, v42, 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 s6, v42, 2 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_mov_b32 s4, s33 +; HAWAII-NEXT: s_mov_b32 s33, s32 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_addk_i32 s32, 0x400 +; HAWAII-NEXT: v_writelane_b32 v42, s4, 2 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_writelane_b32 v42, s30, 0 +; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HAWAII-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; HAWAII-NEXT: v_writelane_b32 v42, s31, 1 +; HAWAII-NEXT: v_mov_b32_e32 v40, v1 +; HAWAII-NEXT: v_mov_b32_e32 v41, v0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HAWAII-NEXT: v_mov_b32_e32 v2, v0 +; HAWAII-NEXT: v_mov_b32_e32 v0, v41 +; HAWAII-NEXT: v_mov_b32_e32 v1, v40 +; HAWAII-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; HAWAII-NEXT: v_readlane_b32 s31, v42, 1 +; HAWAII-NEXT: v_readlane_b32 s30, v42, 0 +; HAWAII-NEXT: s_mov_b32 s32, s33 +; HAWAII-NEXT: v_readlane_b32 s6, v42, 2 +; HAWAII-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HAWAII-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; HAWAII-NEXT: s_mov_b64 exec, s[8:9] +; HAWAII-NEXT: s_mov_b32 s33, s6 +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: ; GFX9: ; %bb.0: ; %entry @@ -941,43 +941,43 @@ entry: ; Have stack object in caller and stack passed arguments. SP should be ; in same place at function exit. define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { -; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CIVI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v34, 9 -; CIVI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v34, 9 +; FIJI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v34, 9 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v34, 9 +; HAWAII-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: ; GFX9: ; %bb.0: ; %entry @@ -1006,97 +1006,97 @@ entry: } define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { -; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[4:5] -; CIVI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CIVI-NEXT: v_mov_b32_e32 v2, 9 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: v_mov_b32_e32 v3, 0 -; CIVI-NEXT: v_mov_b32_e32 v4, 0 -; CIVI-NEXT: v_mov_b32_e32 v5, 0 -; CIVI-NEXT: v_mov_b32_e32 v6, 0 -; CIVI-NEXT: v_mov_b32_e32 v7, 0 -; CIVI-NEXT: v_mov_b32_e32 v8, 0 -; CIVI-NEXT: v_mov_b32_e32 v9, 0 -; CIVI-NEXT: v_mov_b32_e32 v10, 0 -; CIVI-NEXT: v_mov_b32_e32 v11, 0 -; CIVI-NEXT: v_mov_b32_e32 v12, 0 -; CIVI-NEXT: v_mov_b32_e32 v13, 0 -; CIVI-NEXT: v_mov_b32_e32 v14, 0 -; CIVI-NEXT: v_mov_b32_e32 v15, 0 -; CIVI-NEXT: v_mov_b32_e32 v16, 0 -; CIVI-NEXT: v_mov_b32_e32 v17, 0 -; CIVI-NEXT: v_mov_b32_e32 v18, 0 -; CIVI-NEXT: v_mov_b32_e32 v19, 0 -; CIVI-NEXT: v_mov_b32_e32 v20, 0 -; CIVI-NEXT: v_mov_b32_e32 v21, 0 -; CIVI-NEXT: v_mov_b32_e32 v22, 0 -; CIVI-NEXT: v_mov_b32_e32 v23, 0 -; CIVI-NEXT: v_mov_b32_e32 v24, 0 -; CIVI-NEXT: v_mov_b32_e32 v25, 0 -; CIVI-NEXT: v_mov_b32_e32 v26, 0 -; CIVI-NEXT: v_mov_b32_e32 v27, 0 -; CIVI-NEXT: v_mov_b32_e32 v28, 0 -; CIVI-NEXT: v_mov_b32_e32 v29, 0 -; CIVI-NEXT: v_mov_b32_e32 v30, 0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[4:5] +; FIJI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[4:5] +; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FIJI-NEXT: v_mov_b32_e32 v2, 9 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: v_mov_b32_e32 v3, 0 +; FIJI-NEXT: v_mov_b32_e32 v4, 0 +; FIJI-NEXT: v_mov_b32_e32 v5, 0 +; FIJI-NEXT: v_mov_b32_e32 v6, 0 +; FIJI-NEXT: v_mov_b32_e32 v7, 0 +; FIJI-NEXT: v_mov_b32_e32 v8, 0 +; FIJI-NEXT: v_mov_b32_e32 v9, 0 +; FIJI-NEXT: v_mov_b32_e32 v10, 0 +; FIJI-NEXT: v_mov_b32_e32 v11, 0 +; FIJI-NEXT: v_mov_b32_e32 v12, 0 +; FIJI-NEXT: v_mov_b32_e32 v13, 0 +; FIJI-NEXT: v_mov_b32_e32 v14, 0 +; FIJI-NEXT: v_mov_b32_e32 v15, 0 +; FIJI-NEXT: v_mov_b32_e32 v16, 0 +; FIJI-NEXT: v_mov_b32_e32 v17, 0 +; FIJI-NEXT: v_mov_b32_e32 v18, 0 +; FIJI-NEXT: v_mov_b32_e32 v19, 0 +; FIJI-NEXT: v_mov_b32_e32 v20, 0 +; FIJI-NEXT: v_mov_b32_e32 v21, 0 +; FIJI-NEXT: v_mov_b32_e32 v22, 0 +; FIJI-NEXT: v_mov_b32_e32 v23, 0 +; FIJI-NEXT: v_mov_b32_e32 v24, 0 +; FIJI-NEXT: v_mov_b32_e32 v25, 0 +; FIJI-NEXT: v_mov_b32_e32 v26, 0 +; FIJI-NEXT: v_mov_b32_e32 v27, 0 +; FIJI-NEXT: v_mov_b32_e32 v28, 0 +; FIJI-NEXT: v_mov_b32_e32 v29, 0 +; FIJI-NEXT: v_mov_b32_e32 v30, 0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[4:5] ; -; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v2, 9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; HAWAII-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[4:5] +; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; HAWAII-NEXT: v_mov_b32_e32 v2, 9 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: v_mov_b32_e32 v3, 0 +; HAWAII-NEXT: v_mov_b32_e32 v4, 0 +; HAWAII-NEXT: v_mov_b32_e32 v5, 0 +; HAWAII-NEXT: v_mov_b32_e32 v6, 0 +; HAWAII-NEXT: v_mov_b32_e32 v7, 0 +; HAWAII-NEXT: v_mov_b32_e32 v8, 0 +; HAWAII-NEXT: v_mov_b32_e32 v9, 0 +; HAWAII-NEXT: v_mov_b32_e32 v10, 0 +; HAWAII-NEXT: v_mov_b32_e32 v11, 0 +; HAWAII-NEXT: v_mov_b32_e32 v12, 0 +; HAWAII-NEXT: v_mov_b32_e32 v13, 0 +; HAWAII-NEXT: v_mov_b32_e32 v14, 0 +; HAWAII-NEXT: v_mov_b32_e32 v15, 0 +; HAWAII-NEXT: v_mov_b32_e32 v16, 0 +; HAWAII-NEXT: v_mov_b32_e32 v17, 0 +; HAWAII-NEXT: v_mov_b32_e32 v18, 0 +; HAWAII-NEXT: v_mov_b32_e32 v19, 0 +; HAWAII-NEXT: v_mov_b32_e32 v20, 0 +; HAWAII-NEXT: v_mov_b32_e32 v21, 0 +; HAWAII-NEXT: v_mov_b32_e32 v22, 0 +; HAWAII-NEXT: v_mov_b32_e32 v23, 0 +; HAWAII-NEXT: v_mov_b32_e32 v24, 0 +; HAWAII-NEXT: v_mov_b32_e32 v25, 0 +; HAWAII-NEXT: v_mov_b32_e32 v26, 0 +; HAWAII-NEXT: v_mov_b32_e32 v27, 0 +; HAWAII-NEXT: v_mov_b32_e32 v28, 0 +; HAWAII-NEXT: v_mov_b32_e32 v29, 0 +; HAWAII-NEXT: v_mov_b32_e32 v30, 0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[4:5] ; ; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GFX9: ; %bb.0: ; %entry @@ -1155,29 +1155,29 @@ entry: ; Do support tail calls with a uniform, but unknown, callee. define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 -; CIVI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; FIJI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; HAWAII-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: ; GFX9: ; %bb.0: ; %entry @@ -1199,187 +1199,187 @@ entry: ; We can't support a tail call to a divergent target. Use a waterfall ; loop around a regular call define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 { -; CIVI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 s16, s33 -; CIVI-NEXT: s_mov_b32 s33, s32 -; CIVI-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CIVI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CIVI-NEXT: s_mov_b64 exec, s[18:19] -; CIVI-NEXT: v_writelane_b32 v40, s16, 18 -; CIVI-NEXT: v_writelane_b32 v40, s30, 0 -; CIVI-NEXT: v_writelane_b32 v40, s31, 1 -; CIVI-NEXT: v_writelane_b32 v40, s34, 2 -; CIVI-NEXT: v_writelane_b32 v40, s35, 3 -; CIVI-NEXT: v_writelane_b32 v40, s36, 4 -; CIVI-NEXT: v_writelane_b32 v40, s37, 5 -; CIVI-NEXT: v_writelane_b32 v40, s38, 6 -; CIVI-NEXT: v_writelane_b32 v40, s39, 7 -; CIVI-NEXT: v_writelane_b32 v40, s40, 8 -; CIVI-NEXT: v_writelane_b32 v40, s41, 9 -; CIVI-NEXT: v_writelane_b32 v40, s42, 10 -; CIVI-NEXT: v_writelane_b32 v40, s43, 11 -; CIVI-NEXT: v_writelane_b32 v40, s44, 12 -; CIVI-NEXT: v_writelane_b32 v40, s45, 13 -; CIVI-NEXT: v_writelane_b32 v40, s46, 14 -; CIVI-NEXT: v_writelane_b32 v40, s47, 15 -; CIVI-NEXT: v_writelane_b32 v40, s48, 16 -; CIVI-NEXT: s_mov_b32 s42, s15 -; CIVI-NEXT: s_mov_b32 s43, s14 -; CIVI-NEXT: s_mov_b32 s44, s13 -; CIVI-NEXT: s_mov_b32 s45, s12 -; CIVI-NEXT: s_mov_b64 s[34:35], s[10:11] -; CIVI-NEXT: s_mov_b64 s[36:37], s[8:9] -; CIVI-NEXT: s_mov_b64 s[38:39], s[6:7] -; CIVI-NEXT: s_mov_b64 s[40:41], s[4:5] -; CIVI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; CIVI-NEXT: s_mov_b64 s[46:47], exec -; CIVI-NEXT: s_addk_i32 s32, 0x400 -; CIVI-NEXT: v_writelane_b32 v40, s49, 17 -; CIVI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; CIVI-NEXT: v_readfirstlane_b32 s16, v0 -; CIVI-NEXT: v_readfirstlane_b32 s17, v1 -; CIVI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; CIVI-NEXT: s_and_saveexec_b64 s[48:49], vcc -; CIVI-NEXT: s_mov_b64 s[4:5], s[40:41] -; CIVI-NEXT: s_mov_b64 s[6:7], s[38:39] -; CIVI-NEXT: s_mov_b64 s[8:9], s[36:37] -; CIVI-NEXT: s_mov_b64 s[10:11], s[34:35] -; CIVI-NEXT: s_mov_b32 s12, s45 -; CIVI-NEXT: s_mov_b32 s13, s44 -; CIVI-NEXT: s_mov_b32 s14, s43 -; CIVI-NEXT: s_mov_b32 s15, s42 -; CIVI-NEXT: v_mov_b32_e32 v0, v2 -; CIVI-NEXT: v_mov_b32_e32 v1, v3 -; CIVI-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CIVI-NEXT: v_mov_b32_e32 v4, v0 -; CIVI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; CIVI-NEXT: ; implicit-def: $vgpr31 -; CIVI-NEXT: ; implicit-def: $vgpr2 -; CIVI-NEXT: ; implicit-def: $vgpr3 -; CIVI-NEXT: s_xor_b64 exec, exec, s[48:49] -; CIVI-NEXT: s_cbranch_execnz .LBB18_1 -; CIVI-NEXT: ; %bb.2: -; CIVI-NEXT: s_mov_b64 exec, s[46:47] -; CIVI-NEXT: v_mov_b32_e32 v0, v4 -; CIVI-NEXT: v_readlane_b32 s49, v40, 17 -; CIVI-NEXT: v_readlane_b32 s48, v40, 16 -; CIVI-NEXT: v_readlane_b32 s47, v40, 15 -; CIVI-NEXT: v_readlane_b32 s46, v40, 14 -; CIVI-NEXT: v_readlane_b32 s45, v40, 13 -; CIVI-NEXT: v_readlane_b32 s44, v40, 12 -; CIVI-NEXT: v_readlane_b32 s43, v40, 11 -; CIVI-NEXT: v_readlane_b32 s42, v40, 10 -; CIVI-NEXT: v_readlane_b32 s41, v40, 9 -; CIVI-NEXT: v_readlane_b32 s40, v40, 8 -; CIVI-NEXT: v_readlane_b32 s39, v40, 7 -; CIVI-NEXT: v_readlane_b32 s38, v40, 6 -; CIVI-NEXT: v_readlane_b32 s37, v40, 5 -; CIVI-NEXT: v_readlane_b32 s36, v40, 4 -; CIVI-NEXT: v_readlane_b32 s35, v40, 3 -; CIVI-NEXT: v_readlane_b32 s34, v40, 2 -; CIVI-NEXT: v_readlane_b32 s31, v40, 1 -; CIVI-NEXT: v_readlane_b32 s30, v40, 0 -; CIVI-NEXT: s_mov_b32 s32, s33 -; CIVI-NEXT: v_readlane_b32 s4, v40, 18 -; CIVI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CIVI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CIVI-NEXT: s_mov_b64 exec, s[6:7] -; CIVI-NEXT: s_mov_b32 s33, s4 -; CIVI-NEXT: s_waitcnt vmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; FIJI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_mov_b32 s16, s33 +; FIJI-NEXT: s_mov_b32 s33, s32 +; FIJI-NEXT: s_or_saveexec_b64 s[18:19], -1 +; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; FIJI-NEXT: s_mov_b64 exec, s[18:19] +; FIJI-NEXT: v_writelane_b32 v40, s16, 18 +; FIJI-NEXT: v_writelane_b32 v40, s30, 0 +; FIJI-NEXT: v_writelane_b32 v40, s31, 1 +; FIJI-NEXT: v_writelane_b32 v40, s34, 2 +; FIJI-NEXT: v_writelane_b32 v40, s35, 3 +; FIJI-NEXT: v_writelane_b32 v40, s36, 4 +; FIJI-NEXT: v_writelane_b32 v40, s37, 5 +; FIJI-NEXT: v_writelane_b32 v40, s38, 6 +; FIJI-NEXT: v_writelane_b32 v40, s39, 7 +; FIJI-NEXT: v_writelane_b32 v40, s40, 8 +; FIJI-NEXT: v_writelane_b32 v40, s41, 9 +; FIJI-NEXT: v_writelane_b32 v40, s42, 10 +; FIJI-NEXT: v_writelane_b32 v40, s43, 11 +; FIJI-NEXT: v_writelane_b32 v40, s44, 12 +; FIJI-NEXT: v_writelane_b32 v40, s45, 13 +; FIJI-NEXT: v_writelane_b32 v40, s46, 14 +; FIJI-NEXT: v_writelane_b32 v40, s47, 15 +; FIJI-NEXT: v_writelane_b32 v40, s48, 16 +; FIJI-NEXT: s_mov_b32 s42, s15 +; FIJI-NEXT: s_mov_b32 s43, s14 +; FIJI-NEXT: s_mov_b32 s44, s13 +; FIJI-NEXT: s_mov_b32 s45, s12 +; FIJI-NEXT: s_mov_b64 s[34:35], s[10:11] +; FIJI-NEXT: s_mov_b64 s[36:37], s[8:9] +; FIJI-NEXT: s_mov_b64 s[38:39], s[6:7] +; FIJI-NEXT: s_mov_b64 s[40:41], s[4:5] +; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; FIJI-NEXT: s_mov_b64 s[46:47], exec +; FIJI-NEXT: s_addk_i32 s32, 0x400 +; FIJI-NEXT: v_writelane_b32 v40, s49, 17 +; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; FIJI-NEXT: v_readfirstlane_b32 s16, v0 +; FIJI-NEXT: v_readfirstlane_b32 s17, v1 +; FIJI-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; FIJI-NEXT: s_and_saveexec_b64 s[48:49], vcc +; FIJI-NEXT: s_mov_b64 s[4:5], s[40:41] +; FIJI-NEXT: s_mov_b64 s[6:7], s[38:39] +; FIJI-NEXT: s_mov_b64 s[8:9], s[36:37] +; FIJI-NEXT: s_mov_b64 s[10:11], s[34:35] +; FIJI-NEXT: s_mov_b32 s12, s45 +; FIJI-NEXT: s_mov_b32 s13, s44 +; FIJI-NEXT: s_mov_b32 s14, s43 +; FIJI-NEXT: s_mov_b32 s15, s42 +; FIJI-NEXT: v_mov_b32_e32 v0, v2 +; FIJI-NEXT: v_mov_b32_e32 v1, v3 +; FIJI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FIJI-NEXT: v_mov_b32_e32 v4, v0 +; FIJI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; FIJI-NEXT: ; implicit-def: $vgpr31 +; FIJI-NEXT: ; implicit-def: $vgpr2 +; FIJI-NEXT: ; implicit-def: $vgpr3 +; FIJI-NEXT: s_xor_b64 exec, exec, s[48:49] +; FIJI-NEXT: s_cbranch_execnz .LBB18_1 +; FIJI-NEXT: ; %bb.2: +; FIJI-NEXT: s_mov_b64 exec, s[46:47] +; FIJI-NEXT: v_mov_b32_e32 v0, v4 +; FIJI-NEXT: v_readlane_b32 s49, v40, 17 +; FIJI-NEXT: v_readlane_b32 s48, v40, 16 +; FIJI-NEXT: v_readlane_b32 s47, v40, 15 +; FIJI-NEXT: v_readlane_b32 s46, v40, 14 +; FIJI-NEXT: v_readlane_b32 s45, v40, 13 +; FIJI-NEXT: v_readlane_b32 s44, v40, 12 +; FIJI-NEXT: v_readlane_b32 s43, v40, 11 +; FIJI-NEXT: v_readlane_b32 s42, v40, 10 +; FIJI-NEXT: v_readlane_b32 s41, v40, 9 +; FIJI-NEXT: v_readlane_b32 s40, v40, 8 +; FIJI-NEXT: v_readlane_b32 s39, v40, 7 +; FIJI-NEXT: v_readlane_b32 s38, v40, 6 +; FIJI-NEXT: v_readlane_b32 s37, v40, 5 +; FIJI-NEXT: v_readlane_b32 s36, v40, 4 +; FIJI-NEXT: v_readlane_b32 s35, v40, 3 +; FIJI-NEXT: v_readlane_b32 s34, v40, 2 +; FIJI-NEXT: v_readlane_b32 s31, v40, 1 +; FIJI-NEXT: v_readlane_b32 s30, v40, 0 +; FIJI-NEXT: s_mov_b32 s32, s33 +; FIJI-NEXT: v_readlane_b32 s4, v40, 18 +; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; FIJI-NEXT: s_mov_b64 exec, s[6:7] +; FIJI-NEXT: s_mov_b32 s33, s4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: s_setpc_b64 s[30:31] ; -; GCN-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s16, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: s_mov_b32 s42, s15 -; GCN-NEXT: s_mov_b32 s43, s14 -; GCN-NEXT: s_mov_b32 s44, s13 -; GCN-NEXT: s_mov_b32 s45, s12 -; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] -; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] -; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] -; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: s_mov_b64 s[46:47], exec -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v0 -; GCN-NEXT: v_readfirstlane_b32 s17, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] -; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] -; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] -; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] -; GCN-NEXT: s_mov_b32 s12, s45 -; GCN-NEXT: s_mov_b32 s13, s44 -; GCN-NEXT: s_mov_b32 s14, s43 -; GCN-NEXT: s_mov_b32 s15, s42 -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] -; GCN-NEXT: s_cbranch_execnz .LBB18_1 -; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; HAWAII-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_mov_b32 s16, s33 +; HAWAII-NEXT: s_mov_b32 s33, s32 +; HAWAII-NEXT: s_or_saveexec_b64 s[18:19], -1 +; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HAWAII-NEXT: s_mov_b64 exec, s[18:19] +; HAWAII-NEXT: v_writelane_b32 v40, s16, 18 +; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 +; HAWAII-NEXT: v_writelane_b32 v40, s34, 2 +; HAWAII-NEXT: v_writelane_b32 v40, s35, 3 +; HAWAII-NEXT: v_writelane_b32 v40, s36, 4 +; HAWAII-NEXT: v_writelane_b32 v40, s37, 5 +; HAWAII-NEXT: v_writelane_b32 v40, s38, 6 +; HAWAII-NEXT: v_writelane_b32 v40, s39, 7 +; HAWAII-NEXT: v_writelane_b32 v40, s40, 8 +; HAWAII-NEXT: v_writelane_b32 v40, s41, 9 +; HAWAII-NEXT: v_writelane_b32 v40, s42, 10 +; HAWAII-NEXT: v_writelane_b32 v40, s43, 11 +; HAWAII-NEXT: v_writelane_b32 v40, s44, 12 +; HAWAII-NEXT: v_writelane_b32 v40, s45, 13 +; HAWAII-NEXT: v_writelane_b32 v40, s46, 14 +; HAWAII-NEXT: v_writelane_b32 v40, s47, 15 +; HAWAII-NEXT: v_writelane_b32 v40, s48, 16 +; HAWAII-NEXT: s_mov_b32 s42, s15 +; HAWAII-NEXT: s_mov_b32 s43, s14 +; HAWAII-NEXT: s_mov_b32 s44, s13 +; HAWAII-NEXT: s_mov_b32 s45, s12 +; HAWAII-NEXT: s_mov_b64 s[34:35], s[10:11] +; HAWAII-NEXT: s_mov_b64 s[36:37], s[8:9] +; HAWAII-NEXT: s_mov_b64 s[38:39], s[6:7] +; HAWAII-NEXT: s_mov_b64 s[40:41], s[4:5] +; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; HAWAII-NEXT: s_mov_b64 s[46:47], exec +; HAWAII-NEXT: s_addk_i32 s32, 0x400 +; HAWAII-NEXT: v_writelane_b32 v40, s49, 17 +; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; HAWAII-NEXT: v_readfirstlane_b32 s16, v0 +; HAWAII-NEXT: v_readfirstlane_b32 s17, v1 +; HAWAII-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] +; HAWAII-NEXT: s_and_saveexec_b64 s[48:49], vcc +; HAWAII-NEXT: s_mov_b64 s[4:5], s[40:41] +; HAWAII-NEXT: s_mov_b64 s[6:7], s[38:39] +; HAWAII-NEXT: s_mov_b64 s[8:9], s[36:37] +; HAWAII-NEXT: s_mov_b64 s[10:11], s[34:35] +; HAWAII-NEXT: s_mov_b32 s12, s45 +; HAWAII-NEXT: s_mov_b32 s13, s44 +; HAWAII-NEXT: s_mov_b32 s14, s43 +; HAWAII-NEXT: s_mov_b32 s15, s42 +; HAWAII-NEXT: v_mov_b32_e32 v0, v2 +; HAWAII-NEXT: v_mov_b32_e32 v1, v3 +; HAWAII-NEXT: s_swappc_b64 s[30:31], s[16:17] +; HAWAII-NEXT: v_mov_b32_e32 v4, v0 +; HAWAII-NEXT: ; implicit-def: $vgpr0_vgpr1 +; HAWAII-NEXT: ; implicit-def: $vgpr31 +; HAWAII-NEXT: ; implicit-def: $vgpr2 +; HAWAII-NEXT: ; implicit-def: $vgpr3 +; HAWAII-NEXT: s_xor_b64 exec, exec, s[48:49] +; HAWAII-NEXT: s_cbranch_execnz .LBB18_1 +; HAWAII-NEXT: ; %bb.2: +; HAWAII-NEXT: s_mov_b64 exec, s[46:47] +; HAWAII-NEXT: v_mov_b32_e32 v0, v4 +; HAWAII-NEXT: v_readlane_b32 s49, v40, 17 +; HAWAII-NEXT: v_readlane_b32 s48, v40, 16 +; HAWAII-NEXT: v_readlane_b32 s47, v40, 15 +; HAWAII-NEXT: v_readlane_b32 s46, v40, 14 +; HAWAII-NEXT: v_readlane_b32 s45, v40, 13 +; HAWAII-NEXT: v_readlane_b32 s44, v40, 12 +; HAWAII-NEXT: v_readlane_b32 s43, v40, 11 +; HAWAII-NEXT: v_readlane_b32 s42, v40, 10 +; HAWAII-NEXT: v_readlane_b32 s41, v40, 9 +; HAWAII-NEXT: v_readlane_b32 s40, v40, 8 +; HAWAII-NEXT: v_readlane_b32 s39, v40, 7 +; HAWAII-NEXT: v_readlane_b32 s38, v40, 6 +; HAWAII-NEXT: v_readlane_b32 s37, v40, 5 +; HAWAII-NEXT: v_readlane_b32 s36, v40, 4 +; HAWAII-NEXT: v_readlane_b32 s35, v40, 3 +; HAWAII-NEXT: v_readlane_b32 s34, v40, 2 +; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 +; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 +; HAWAII-NEXT: s_mov_b32 s32, s33 +; HAWAII-NEXT: v_readlane_b32 s4, v40, 18 +; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HAWAII-NEXT: s_mov_b64 exec, s[6:7] +; HAWAII-NEXT: s_mov_b32 s33, s4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32: ; GFX9: ; %bb.0: ; %entry @@ -1480,53 +1480,53 @@ entry: declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64])) define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { -; CIVI-LABEL: sibling_call_fastcc_multi_byval: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v1, 9 -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; CIVI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_fastcc_multi_byval: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, 9 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_fastcc_multi_byval: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 9 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_fastcc_multi_byval: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, 9 +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_fastcc_multi_byval: ; GFX9: ; %bb.0: ; %entry @@ -1564,103 +1564,103 @@ declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([ ; Callee has a byval and non-byval stack passed argument define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 { -; CIVI-LABEL: sibling_call_byval_and_stack_passed: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_mov_b32_e32 v1, 9 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; CIVI-NEXT: v_mov_b32_e32 v1, 0 -; CIVI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 -; CIVI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; CIVI-NEXT: v_mov_b32_e32 v0, 0 -; CIVI-NEXT: v_mov_b32_e32 v1, 0 -; CIVI-NEXT: v_mov_b32_e32 v2, 0 -; CIVI-NEXT: v_mov_b32_e32 v3, 0 -; CIVI-NEXT: v_mov_b32_e32 v4, 0 -; CIVI-NEXT: v_mov_b32_e32 v5, 0 -; CIVI-NEXT: v_mov_b32_e32 v6, 0 -; CIVI-NEXT: v_mov_b32_e32 v7, 0 -; CIVI-NEXT: v_mov_b32_e32 v8, 0 -; CIVI-NEXT: v_mov_b32_e32 v9, 0 -; CIVI-NEXT: v_mov_b32_e32 v10, 0 -; CIVI-NEXT: v_mov_b32_e32 v11, 0 -; CIVI-NEXT: v_mov_b32_e32 v12, 0 -; CIVI-NEXT: v_mov_b32_e32 v13, 0 -; CIVI-NEXT: v_mov_b32_e32 v14, 0 -; CIVI-NEXT: v_mov_b32_e32 v15, 0 -; CIVI-NEXT: v_mov_b32_e32 v16, 0 -; CIVI-NEXT: v_mov_b32_e32 v17, 0 -; CIVI-NEXT: v_mov_b32_e32 v18, 0 -; CIVI-NEXT: v_mov_b32_e32 v19, 0 -; CIVI-NEXT: v_mov_b32_e32 v20, 0 -; CIVI-NEXT: v_mov_b32_e32 v21, 0 -; CIVI-NEXT: v_mov_b32_e32 v22, 0 -; CIVI-NEXT: v_mov_b32_e32 v23, 0 -; CIVI-NEXT: v_mov_b32_e32 v24, 0 -; CIVI-NEXT: v_mov_b32_e32 v25, 0 -; CIVI-NEXT: v_mov_b32_e32 v26, 0 -; CIVI-NEXT: v_mov_b32_e32 v27, 0 -; CIVI-NEXT: v_mov_b32_e32 v28, 0 -; CIVI-NEXT: v_mov_b32_e32 v29, 0 -; CIVI-NEXT: v_mov_b32_e32 v30, 0 -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_byval_and_stack_passed: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, 9 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v1, 0 +; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; FIJI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; FIJI-NEXT: v_mov_b32_e32 v0, 0 +; FIJI-NEXT: v_mov_b32_e32 v1, 0 +; FIJI-NEXT: v_mov_b32_e32 v2, 0 +; FIJI-NEXT: v_mov_b32_e32 v3, 0 +; FIJI-NEXT: v_mov_b32_e32 v4, 0 +; FIJI-NEXT: v_mov_b32_e32 v5, 0 +; FIJI-NEXT: v_mov_b32_e32 v6, 0 +; FIJI-NEXT: v_mov_b32_e32 v7, 0 +; FIJI-NEXT: v_mov_b32_e32 v8, 0 +; FIJI-NEXT: v_mov_b32_e32 v9, 0 +; FIJI-NEXT: v_mov_b32_e32 v10, 0 +; FIJI-NEXT: v_mov_b32_e32 v11, 0 +; FIJI-NEXT: v_mov_b32_e32 v12, 0 +; FIJI-NEXT: v_mov_b32_e32 v13, 0 +; FIJI-NEXT: v_mov_b32_e32 v14, 0 +; FIJI-NEXT: v_mov_b32_e32 v15, 0 +; FIJI-NEXT: v_mov_b32_e32 v16, 0 +; FIJI-NEXT: v_mov_b32_e32 v17, 0 +; FIJI-NEXT: v_mov_b32_e32 v18, 0 +; FIJI-NEXT: v_mov_b32_e32 v19, 0 +; FIJI-NEXT: v_mov_b32_e32 v20, 0 +; FIJI-NEXT: v_mov_b32_e32 v21, 0 +; FIJI-NEXT: v_mov_b32_e32 v22, 0 +; FIJI-NEXT: v_mov_b32_e32 v23, 0 +; FIJI-NEXT: v_mov_b32_e32 v24, 0 +; FIJI-NEXT: v_mov_b32_e32 v25, 0 +; FIJI-NEXT: v_mov_b32_e32 v26, 0 +; FIJI-NEXT: v_mov_b32_e32 v27, 0 +; FIJI-NEXT: v_mov_b32_e32 v28, 0 +; FIJI-NEXT: v_mov_b32_e32 v29, 0 +; FIJI-NEXT: v_mov_b32_e32 v30, 0 +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_byval_and_stack_passed: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_byval_and_stack_passed: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, 9 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v1, 0 +; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; HAWAII-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HAWAII-NEXT: v_mov_b32_e32 v0, 0 +; HAWAII-NEXT: v_mov_b32_e32 v1, 0 +; HAWAII-NEXT: v_mov_b32_e32 v2, 0 +; HAWAII-NEXT: v_mov_b32_e32 v3, 0 +; HAWAII-NEXT: v_mov_b32_e32 v4, 0 +; HAWAII-NEXT: v_mov_b32_e32 v5, 0 +; HAWAII-NEXT: v_mov_b32_e32 v6, 0 +; HAWAII-NEXT: v_mov_b32_e32 v7, 0 +; HAWAII-NEXT: v_mov_b32_e32 v8, 0 +; HAWAII-NEXT: v_mov_b32_e32 v9, 0 +; HAWAII-NEXT: v_mov_b32_e32 v10, 0 +; HAWAII-NEXT: v_mov_b32_e32 v11, 0 +; HAWAII-NEXT: v_mov_b32_e32 v12, 0 +; HAWAII-NEXT: v_mov_b32_e32 v13, 0 +; HAWAII-NEXT: v_mov_b32_e32 v14, 0 +; HAWAII-NEXT: v_mov_b32_e32 v15, 0 +; HAWAII-NEXT: v_mov_b32_e32 v16, 0 +; HAWAII-NEXT: v_mov_b32_e32 v17, 0 +; HAWAII-NEXT: v_mov_b32_e32 v18, 0 +; HAWAII-NEXT: v_mov_b32_e32 v19, 0 +; HAWAII-NEXT: v_mov_b32_e32 v20, 0 +; HAWAII-NEXT: v_mov_b32_e32 v21, 0 +; HAWAII-NEXT: v_mov_b32_e32 v22, 0 +; HAWAII-NEXT: v_mov_b32_e32 v23, 0 +; HAWAII-NEXT: v_mov_b32_e32 v24, 0 +; HAWAII-NEXT: v_mov_b32_e32 v25, 0 +; HAWAII-NEXT: v_mov_b32_e32 v26, 0 +; HAWAII-NEXT: v_mov_b32_e32 v27, 0 +; HAWAII-NEXT: v_mov_b32_e32 v28, 0 +; HAWAII-NEXT: v_mov_b32_e32 v29, 0 +; HAWAII-NEXT: v_mov_b32_e32 v30, 0 +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_byval_and_stack_passed: ; GFX9: ; %bb.0: ; %entry @@ -1720,21 +1720,21 @@ entry: declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0) define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { -; CIVI-LABEL: sibling_call_i64_fastcc_i64: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_i64_fastcc_i64: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_i64_fastcc_i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_i64_fastcc_i64: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_i64_fastcc_i64: ; GFX9: ; %bb.0: ; %entry @@ -1751,21 +1751,21 @@ entry: declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { -; CIVI-LABEL: sibling_call_p1i8_fastcc_p1i8: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_p1i8_fastcc_p1i8: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_p1i8_fastcc_p1i8: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_p1i8_fastcc_p1i8: ; GFX9: ; %bb.0: ; %entry @@ -1782,21 +1782,21 @@ entry: declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0) define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { -; CIVI-LABEL: sibling_call_i16_fastcc_i16: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_i16_fastcc_i16: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_i16_fastcc_i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_i16_fastcc_i16: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_i16_fastcc_i16: ; GFX9: ; %bb.0: ; %entry @@ -1813,21 +1813,21 @@ entry: declare hidden fastcc half @f16_fastcc_f16(half %arg0) define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { -; CIVI-LABEL: sibling_call_f16_fastcc_f16: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_f16_fastcc_f16: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_f16_fastcc_f16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_f16_fastcc_f16: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_f16_fastcc_f16: ; GFX9: ; %bb.0: ; %entry @@ -1844,21 +1844,21 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { -; CIVI-LABEL: sibling_call_v3i16_fastcc_v3i16: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_v3i16_fastcc_v3i16: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_v3i16_fastcc_v3i16: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16: ; GFX9: ; %bb.0: ; %entry @@ -1875,21 +1875,21 @@ entry: declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0) define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 { -; CIVI-LABEL: sibling_call_v4i16_fastcc_v4i16: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_v4i16_fastcc_v4i16: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_v4i16_fastcc_v4i16: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_v4i16_fastcc_v4i16: ; GFX9: ; %bb.0: ; %entry @@ -1906,21 +1906,21 @@ entry: declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0) define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 { -; CIVI-LABEL: sibling_call_v2i64_fastcc_v2i64: -; CIVI: ; %bb.0: ; %entry -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_getpc_b64 s[16:17] -; CIVI-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 -; CIVI-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 -; CIVI-NEXT: s_setpc_b64 s[16:17] +; FIJI-LABEL: sibling_call_v2i64_fastcc_v2i64: +; FIJI: ; %bb.0: ; %entry +; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FIJI-NEXT: s_getpc_b64 s[16:17] +; FIJI-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; FIJI-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; FIJI-NEXT: s_setpc_b64 s[16:17] ; -; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[16:17] +; HAWAII-LABEL: sibling_call_v2i64_fastcc_v2i64: +; HAWAII: ; %bb.0: ; %entry +; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HAWAII-NEXT: s_getpc_b64 s[16:17] +; HAWAII-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; HAWAII-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; HAWAII-NEXT: s_setpc_b64 s[16:17] ; ; GFX9-LABEL: sibling_call_v2i64_fastcc_v2i64: ; GFX9: ; %bb.0: ; %entry From c2b1c505149ff118dd122096b08b4beb90bc1ad9 Mon Sep 17 00:00:00 2001 From: easyonaadit Date: Mon, 27 Jan 2025 13:51:18 +0530 Subject: [PATCH 4/4] Updated Runlines --- llvm/test/CodeGen/AMDGPU/nested-calls.ll | 10 +- llvm/test/CodeGen/AMDGPU/sibling-call.ll | 1686 ++++++---------------- 2 files changed, 419 insertions(+), 1277 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 31e520ce74d98..1821872b82c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; Test calls when called by other callable functions rather than ; kernels. @@ -89,3 +89,7 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; FIJI: {{.*}} +; GFX9: {{.*}} +; HAWAII: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index c79d60c2a951f..cd7f0c62b0011 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FIJI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HAWAII %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? @@ -62,76 +62,32 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { } define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret i32 %ret } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v2, 9 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v2, 9 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; HAWAII-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 9 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -141,41 +97,17 @@ entry: } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v2, 9 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v2, 9 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; HAWAII-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 9 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -185,35 +117,15 @@ entry: } define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) ret void @@ -312,95 +224,35 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) b ; Tail call disallowed with byval in parent. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_mov_b32 s4, s33 -; FIJI-NEXT: s_mov_b32 s33, s32 -; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FIJI-NEXT: s_mov_b64 exec, s[6:7] -; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], s33 -; FIJI-NEXT: v_writelane_b32 v40, s4, 2 -; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v40, s30, 0 -; FIJI-NEXT: v_writelane_b32 v40, s31, 1 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; FIJI-NEXT: v_readlane_b32 s31, v40, 1 -; FIJI-NEXT: v_readlane_b32 s30, v40, 0 -; FIJI-NEXT: s_mov_b32 s32, s33 -; FIJI-NEXT: v_readlane_b32 s4, v40, 2 -; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FIJI-NEXT: s_mov_b64 exec, s[6:7] -; FIJI-NEXT: s_mov_b32 s33, s4 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[30:31] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_mov_b32 s4, s33 -; HAWAII-NEXT: s_mov_b32 s33, s32 -; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 -; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; HAWAII-NEXT: s_mov_b64 exec, s[6:7] -; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], s33 -; HAWAII-NEXT: v_writelane_b32 v40, s4, 2 -; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 -; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 -; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 -; HAWAII-NEXT: s_mov_b32 s32, s33 -; HAWAII-NEXT: v_readlane_b32 s4, v40, 2 -; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 -; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; HAWAII-NEXT: s_mov_b64 exec, s[6:7] -; HAWAII-NEXT: s_mov_b32 s33, s4 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval) ret i32 %ret @@ -410,38 +262,16 @@ entry: ; usage of incoming arguments must be <= the outgoing stack ; arguments. define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_byval_i32: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_byval_i32: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5))) ret i32 %ret @@ -491,124 +321,48 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: s_waitcnt vmcnt(2) -; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; FIJI-NEXT: s_waitcnt vmcnt(2) -; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: s_waitcnt vmcnt(2) -; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: s_waitcnt vmcnt(2) -; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: s_waitcnt vmcnt(2) -; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: s_waitcnt vmcnt(2) -; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) ret i32 %ret } define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v34, 9 -; FIJI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v34, 9 -; HAWAII-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v34, 9 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -621,191 +375,67 @@ entry: ; don't do a tail call. ; TODO: Do we really need this restriction? define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { -; FIJI-LABEL: no_sibling_call_callee_more_stack_space: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_mov_b32 s4, s33 -; FIJI-NEXT: s_mov_b32 s33, s32 -; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; FIJI-NEXT: s_mov_b64 exec, s[6:7] -; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v40, s4, 2 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: v_writelane_b32 v40, s30, 0 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: v_mov_b32_e32 v3, 0 -; FIJI-NEXT: v_mov_b32_e32 v4, 0 -; FIJI-NEXT: v_mov_b32_e32 v5, 0 -; FIJI-NEXT: v_mov_b32_e32 v6, 0 -; FIJI-NEXT: v_mov_b32_e32 v7, 0 -; FIJI-NEXT: v_mov_b32_e32 v8, 0 -; FIJI-NEXT: v_mov_b32_e32 v9, 0 -; FIJI-NEXT: v_mov_b32_e32 v10, 0 -; FIJI-NEXT: v_mov_b32_e32 v11, 0 -; FIJI-NEXT: v_mov_b32_e32 v12, 0 -; FIJI-NEXT: v_mov_b32_e32 v13, 0 -; FIJI-NEXT: v_mov_b32_e32 v14, 0 -; FIJI-NEXT: v_mov_b32_e32 v15, 0 -; FIJI-NEXT: v_mov_b32_e32 v16, 0 -; FIJI-NEXT: v_mov_b32_e32 v17, 0 -; FIJI-NEXT: v_mov_b32_e32 v18, 0 -; FIJI-NEXT: v_mov_b32_e32 v19, 0 -; FIJI-NEXT: v_mov_b32_e32 v20, 0 -; FIJI-NEXT: v_mov_b32_e32 v21, 0 -; FIJI-NEXT: v_mov_b32_e32 v22, 0 -; FIJI-NEXT: v_mov_b32_e32 v23, 0 -; FIJI-NEXT: v_mov_b32_e32 v24, 0 -; FIJI-NEXT: v_mov_b32_e32 v25, 0 -; FIJI-NEXT: v_mov_b32_e32 v26, 0 -; FIJI-NEXT: v_mov_b32_e32 v27, 0 -; FIJI-NEXT: v_mov_b32_e32 v28, 0 -; FIJI-NEXT: v_mov_b32_e32 v29, 0 -; FIJI-NEXT: v_mov_b32_e32 v30, 0 -; FIJI-NEXT: v_writelane_b32 v40, s31, 1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; FIJI-NEXT: v_readlane_b32 s31, v40, 1 -; FIJI-NEXT: v_readlane_b32 s30, v40, 0 -; FIJI-NEXT: s_mov_b32 s32, s33 -; FIJI-NEXT: v_readlane_b32 s4, v40, 2 -; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; FIJI-NEXT: s_mov_b64 exec, s[6:7] -; FIJI-NEXT: s_mov_b32 s33, s4 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[30:31] -; -; HAWAII-LABEL: no_sibling_call_callee_more_stack_space: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_mov_b32 s4, s33 -; HAWAII-NEXT: s_mov_b32 s33, s32 -; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 -; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; HAWAII-NEXT: s_mov_b64 exec, s[6:7] -; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v40, s4, 2 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: v_mov_b32_e32 v3, 0 -; HAWAII-NEXT: v_mov_b32_e32 v4, 0 -; HAWAII-NEXT: v_mov_b32_e32 v5, 0 -; HAWAII-NEXT: v_mov_b32_e32 v6, 0 -; HAWAII-NEXT: v_mov_b32_e32 v7, 0 -; HAWAII-NEXT: v_mov_b32_e32 v8, 0 -; HAWAII-NEXT: v_mov_b32_e32 v9, 0 -; HAWAII-NEXT: v_mov_b32_e32 v10, 0 -; HAWAII-NEXT: v_mov_b32_e32 v11, 0 -; HAWAII-NEXT: v_mov_b32_e32 v12, 0 -; HAWAII-NEXT: v_mov_b32_e32 v13, 0 -; HAWAII-NEXT: v_mov_b32_e32 v14, 0 -; HAWAII-NEXT: v_mov_b32_e32 v15, 0 -; HAWAII-NEXT: v_mov_b32_e32 v16, 0 -; HAWAII-NEXT: v_mov_b32_e32 v17, 0 -; HAWAII-NEXT: v_mov_b32_e32 v18, 0 -; HAWAII-NEXT: v_mov_b32_e32 v19, 0 -; HAWAII-NEXT: v_mov_b32_e32 v20, 0 -; HAWAII-NEXT: v_mov_b32_e32 v21, 0 -; HAWAII-NEXT: v_mov_b32_e32 v22, 0 -; HAWAII-NEXT: v_mov_b32_e32 v23, 0 -; HAWAII-NEXT: v_mov_b32_e32 v24, 0 -; HAWAII-NEXT: v_mov_b32_e32 v25, 0 -; HAWAII-NEXT: v_mov_b32_e32 v26, 0 -; HAWAII-NEXT: v_mov_b32_e32 v27, 0 -; HAWAII-NEXT: v_mov_b32_e32 v28, 0 -; HAWAII-NEXT: v_mov_b32_e32 v29, 0 -; HAWAII-NEXT: v_mov_b32_e32 v30, 0 -; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 -; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 -; HAWAII-NEXT: s_mov_b32 s32, s33 -; HAWAII-NEXT: v_readlane_b32 s4, v40, 2 -; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 -; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; HAWAII-NEXT: s_mov_b64 exec, s[6:7] -; HAWAII-NEXT: s_mov_b32 s33, s4 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: no_sibling_call_callee_more_stack_space: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 -; GFX9-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v20, 0 -; GFX9-NEXT: v_mov_b32_e32 v21, 0 -; GFX9-NEXT: v_mov_b32_e32 v22, 0 -; GFX9-NEXT: v_mov_b32_e32 v23, 0 -; GFX9-NEXT: v_mov_b32_e32 v24, 0 -; GFX9-NEXT: v_mov_b32_e32 v25, 0 -; GFX9-NEXT: v_mov_b32_e32 v26, 0 -; GFX9-NEXT: v_mov_b32_e32 v27, 0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_mov_b32_e32 v29, 0 -; GFX9-NEXT: v_mov_b32_e32 v30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: no_sibling_call_callee_more_stack_space: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) ret i32 %ret @@ -813,125 +443,45 @@ entry: ; Have another non-tail in the function define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_mov_b32 s4, s33 -; FIJI-NEXT: s_mov_b32 s33, s32 -; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 -; FIJI-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; FIJI-NEXT: s_mov_b64 exec, s[6:7] -; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v42, s4, 2 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_writelane_b32 v42, s30, 0 -; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; FIJI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; FIJI-NEXT: v_writelane_b32 v42, s31, 1 -; FIJI-NEXT: v_mov_b32_e32 v40, v1 -; FIJI-NEXT: v_mov_b32_e32 v41, v0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; FIJI-NEXT: v_mov_b32_e32 v2, v0 -; FIJI-NEXT: v_mov_b32_e32 v0, v41 -; FIJI-NEXT: v_mov_b32_e32 v1, v40 -; FIJI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; FIJI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; FIJI-NEXT: v_readlane_b32 s31, v42, 1 -; FIJI-NEXT: v_readlane_b32 s30, v42, 0 -; FIJI-NEXT: s_mov_b32 s32, s33 -; FIJI-NEXT: v_readlane_b32 s6, v42, 2 -; FIJI-NEXT: s_or_saveexec_b64 s[8:9], -1 -; FIJI-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; FIJI-NEXT: s_mov_b64 exec, s[8:9] -; FIJI-NEXT: s_mov_b32 s33, s6 -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_mov_b32 s4, s33 -; HAWAII-NEXT: s_mov_b32 s33, s32 -; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 -; HAWAII-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; HAWAII-NEXT: s_mov_b64 exec, s[6:7] -; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v42, s4, 2 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_writelane_b32 v42, s30, 0 -; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; HAWAII-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; HAWAII-NEXT: v_writelane_b32 v42, s31, 1 -; HAWAII-NEXT: v_mov_b32_e32 v40, v1 -; HAWAII-NEXT: v_mov_b32_e32 v41, v0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HAWAII-NEXT: v_mov_b32_e32 v2, v0 -; HAWAII-NEXT: v_mov_b32_e32 v0, v41 -; HAWAII-NEXT: v_mov_b32_e32 v1, v40 -; HAWAII-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; HAWAII-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; HAWAII-NEXT: v_readlane_b32 s31, v42, 1 -; HAWAII-NEXT: v_readlane_b32 s30, v42, 0 -; HAWAII-NEXT: s_mov_b32 s32, s33 -; HAWAII-NEXT: v_readlane_b32 s6, v42, 2 -; HAWAII-NEXT: s_or_saveexec_b64 s[8:9], -1 -; HAWAII-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; HAWAII-NEXT: s_mov_b64 exec, s[8:9] -; HAWAII-NEXT: s_mov_b32 s33, s6 -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v42, s4, 2 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v40, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: v_mov_b32_e32 v1, v40 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 -; GFX9-NEXT: v_readlane_b32 s30, v42, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: v_readlane_b32 s6, v42, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-NEXT: s_mov_b32 s33, s6 -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v42, s4, 2 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_writelane_b32 v42, s30, 0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v42, s31, 1 +; GCN-NEXT: v_mov_b32_e32 v40, v1 +; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v1, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 +; GCN-NEXT: v_readlane_b32 s31, v42, 1 +; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: v_readlane_b32 s6, v42, 2 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call) @@ -941,62 +491,24 @@ entry: ; Have stack object in caller and stack passed arguments. SP should be ; in same place at function exit. define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { -; FIJI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; FIJI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v34, 9 -; FIJI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v34, 9 -; HAWAII-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v34, 9 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v34, 9 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -1006,143 +518,51 @@ entry: } define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { -; FIJI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[4:5] -; FIJI-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; FIJI-NEXT: v_mov_b32_e32 v2, 9 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: v_mov_b32_e32 v3, 0 -; FIJI-NEXT: v_mov_b32_e32 v4, 0 -; FIJI-NEXT: v_mov_b32_e32 v5, 0 -; FIJI-NEXT: v_mov_b32_e32 v6, 0 -; FIJI-NEXT: v_mov_b32_e32 v7, 0 -; FIJI-NEXT: v_mov_b32_e32 v8, 0 -; FIJI-NEXT: v_mov_b32_e32 v9, 0 -; FIJI-NEXT: v_mov_b32_e32 v10, 0 -; FIJI-NEXT: v_mov_b32_e32 v11, 0 -; FIJI-NEXT: v_mov_b32_e32 v12, 0 -; FIJI-NEXT: v_mov_b32_e32 v13, 0 -; FIJI-NEXT: v_mov_b32_e32 v14, 0 -; FIJI-NEXT: v_mov_b32_e32 v15, 0 -; FIJI-NEXT: v_mov_b32_e32 v16, 0 -; FIJI-NEXT: v_mov_b32_e32 v17, 0 -; FIJI-NEXT: v_mov_b32_e32 v18, 0 -; FIJI-NEXT: v_mov_b32_e32 v19, 0 -; FIJI-NEXT: v_mov_b32_e32 v20, 0 -; FIJI-NEXT: v_mov_b32_e32 v21, 0 -; FIJI-NEXT: v_mov_b32_e32 v22, 0 -; FIJI-NEXT: v_mov_b32_e32 v23, 0 -; FIJI-NEXT: v_mov_b32_e32 v24, 0 -; FIJI-NEXT: v_mov_b32_e32 v25, 0 -; FIJI-NEXT: v_mov_b32_e32 v26, 0 -; FIJI-NEXT: v_mov_b32_e32 v27, 0 -; FIJI-NEXT: v_mov_b32_e32 v28, 0 -; FIJI-NEXT: v_mov_b32_e32 v29, 0 -; FIJI-NEXT: v_mov_b32_e32 v30, 0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[4:5] -; -; HAWAII-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[4:5] -; HAWAII-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; HAWAII-NEXT: v_mov_b32_e32 v2, 9 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: v_mov_b32_e32 v3, 0 -; HAWAII-NEXT: v_mov_b32_e32 v4, 0 -; HAWAII-NEXT: v_mov_b32_e32 v5, 0 -; HAWAII-NEXT: v_mov_b32_e32 v6, 0 -; HAWAII-NEXT: v_mov_b32_e32 v7, 0 -; HAWAII-NEXT: v_mov_b32_e32 v8, 0 -; HAWAII-NEXT: v_mov_b32_e32 v9, 0 -; HAWAII-NEXT: v_mov_b32_e32 v10, 0 -; HAWAII-NEXT: v_mov_b32_e32 v11, 0 -; HAWAII-NEXT: v_mov_b32_e32 v12, 0 -; HAWAII-NEXT: v_mov_b32_e32 v13, 0 -; HAWAII-NEXT: v_mov_b32_e32 v14, 0 -; HAWAII-NEXT: v_mov_b32_e32 v15, 0 -; HAWAII-NEXT: v_mov_b32_e32 v16, 0 -; HAWAII-NEXT: v_mov_b32_e32 v17, 0 -; HAWAII-NEXT: v_mov_b32_e32 v18, 0 -; HAWAII-NEXT: v_mov_b32_e32 v19, 0 -; HAWAII-NEXT: v_mov_b32_e32 v20, 0 -; HAWAII-NEXT: v_mov_b32_e32 v21, 0 -; HAWAII-NEXT: v_mov_b32_e32 v22, 0 -; HAWAII-NEXT: v_mov_b32_e32 v23, 0 -; HAWAII-NEXT: v_mov_b32_e32 v24, 0 -; HAWAII-NEXT: v_mov_b32_e32 v25, 0 -; HAWAII-NEXT: v_mov_b32_e32 v26, 0 -; HAWAII-NEXT: v_mov_b32_e32 v27, 0 -; HAWAII-NEXT: v_mov_b32_e32 v28, 0 -; HAWAII-NEXT: v_mov_b32_e32 v29, 0 -; HAWAII-NEXT: v_mov_b32_e32 v30, 0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[4:5] -; -; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 9 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 -; GFX9-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v20, 0 -; GFX9-NEXT: v_mov_b32_e32 v21, 0 -; GFX9-NEXT: v_mov_b32_e32 v22, 0 -; GFX9-NEXT: v_mov_b32_e32 v23, 0 -; GFX9-NEXT: v_mov_b32_e32 v24, 0 -; GFX9-NEXT: v_mov_b32_e32 v25, 0 -; GFX9-NEXT: v_mov_b32_e32 v26, 0 -; GFX9-NEXT: v_mov_b32_e32 v27, 0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_mov_b32_e32 v29, 0 -; GFX9-NEXT: v_mov_b32_e32 v30, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v2, 9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca [16 x i32], align 4, addrspace(5) %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5 @@ -1155,41 +575,17 @@ entry: ; Do support tail calls with a uniform, but unknown, callee. define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { -; FIJI-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 -; FIJI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 -; HAWAII-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b) @@ -1480,77 +876,29 @@ entry: declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64])) define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { -; FIJI-LABEL: sibling_call_fastcc_multi_byval: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, 9 -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; FIJI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_fastcc_multi_byval: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, 9 -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; HAWAII-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_fastcc_multi_byval: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_fastcc_multi_byval: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca0 = alloca [3 x i32], align 16, addrspace(5) %alloca1 = alloca [2 x i64], align 8, addrspace(5) @@ -1564,152 +912,54 @@ declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([ ; Callee has a byval and non-byval stack passed argument define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 { -; FIJI-LABEL: sibling_call_byval_and_stack_passed: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, 9 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v1, 0 -; FIJI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 -; FIJI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; FIJI-NEXT: v_mov_b32_e32 v0, 0 -; FIJI-NEXT: v_mov_b32_e32 v1, 0 -; FIJI-NEXT: v_mov_b32_e32 v2, 0 -; FIJI-NEXT: v_mov_b32_e32 v3, 0 -; FIJI-NEXT: v_mov_b32_e32 v4, 0 -; FIJI-NEXT: v_mov_b32_e32 v5, 0 -; FIJI-NEXT: v_mov_b32_e32 v6, 0 -; FIJI-NEXT: v_mov_b32_e32 v7, 0 -; FIJI-NEXT: v_mov_b32_e32 v8, 0 -; FIJI-NEXT: v_mov_b32_e32 v9, 0 -; FIJI-NEXT: v_mov_b32_e32 v10, 0 -; FIJI-NEXT: v_mov_b32_e32 v11, 0 -; FIJI-NEXT: v_mov_b32_e32 v12, 0 -; FIJI-NEXT: v_mov_b32_e32 v13, 0 -; FIJI-NEXT: v_mov_b32_e32 v14, 0 -; FIJI-NEXT: v_mov_b32_e32 v15, 0 -; FIJI-NEXT: v_mov_b32_e32 v16, 0 -; FIJI-NEXT: v_mov_b32_e32 v17, 0 -; FIJI-NEXT: v_mov_b32_e32 v18, 0 -; FIJI-NEXT: v_mov_b32_e32 v19, 0 -; FIJI-NEXT: v_mov_b32_e32 v20, 0 -; FIJI-NEXT: v_mov_b32_e32 v21, 0 -; FIJI-NEXT: v_mov_b32_e32 v22, 0 -; FIJI-NEXT: v_mov_b32_e32 v23, 0 -; FIJI-NEXT: v_mov_b32_e32 v24, 0 -; FIJI-NEXT: v_mov_b32_e32 v25, 0 -; FIJI-NEXT: v_mov_b32_e32 v26, 0 -; FIJI-NEXT: v_mov_b32_e32 v27, 0 -; FIJI-NEXT: v_mov_b32_e32 v28, 0 -; FIJI-NEXT: v_mov_b32_e32 v29, 0 -; FIJI-NEXT: v_mov_b32_e32 v30, 0 -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_byval_and_stack_passed: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, 9 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v1, 0 -; HAWAII-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 -; HAWAII-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; HAWAII-NEXT: v_mov_b32_e32 v0, 0 -; HAWAII-NEXT: v_mov_b32_e32 v1, 0 -; HAWAII-NEXT: v_mov_b32_e32 v2, 0 -; HAWAII-NEXT: v_mov_b32_e32 v3, 0 -; HAWAII-NEXT: v_mov_b32_e32 v4, 0 -; HAWAII-NEXT: v_mov_b32_e32 v5, 0 -; HAWAII-NEXT: v_mov_b32_e32 v6, 0 -; HAWAII-NEXT: v_mov_b32_e32 v7, 0 -; HAWAII-NEXT: v_mov_b32_e32 v8, 0 -; HAWAII-NEXT: v_mov_b32_e32 v9, 0 -; HAWAII-NEXT: v_mov_b32_e32 v10, 0 -; HAWAII-NEXT: v_mov_b32_e32 v11, 0 -; HAWAII-NEXT: v_mov_b32_e32 v12, 0 -; HAWAII-NEXT: v_mov_b32_e32 v13, 0 -; HAWAII-NEXT: v_mov_b32_e32 v14, 0 -; HAWAII-NEXT: v_mov_b32_e32 v15, 0 -; HAWAII-NEXT: v_mov_b32_e32 v16, 0 -; HAWAII-NEXT: v_mov_b32_e32 v17, 0 -; HAWAII-NEXT: v_mov_b32_e32 v18, 0 -; HAWAII-NEXT: v_mov_b32_e32 v19, 0 -; HAWAII-NEXT: v_mov_b32_e32 v20, 0 -; HAWAII-NEXT: v_mov_b32_e32 v21, 0 -; HAWAII-NEXT: v_mov_b32_e32 v22, 0 -; HAWAII-NEXT: v_mov_b32_e32 v23, 0 -; HAWAII-NEXT: v_mov_b32_e32 v24, 0 -; HAWAII-NEXT: v_mov_b32_e32 v25, 0 -; HAWAII-NEXT: v_mov_b32_e32 v26, 0 -; HAWAII-NEXT: v_mov_b32_e32 v27, 0 -; HAWAII-NEXT: v_mov_b32_e32 v28, 0 -; HAWAII-NEXT: v_mov_b32_e32 v29, 0 -; HAWAII-NEXT: v_mov_b32_e32 v30, 0 -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_byval_and_stack_passed: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 -; GFX9-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v20, 0 -; GFX9-NEXT: v_mov_b32_e32 v21, 0 -; GFX9-NEXT: v_mov_b32_e32 v22, 0 -; GFX9-NEXT: v_mov_b32_e32 v23, 0 -; GFX9-NEXT: v_mov_b32_e32 v24, 0 -; GFX9-NEXT: v_mov_b32_e32 v25, 0 -; GFX9-NEXT: v_mov_b32_e32 v26, 0 -; GFX9-NEXT: v_mov_b32_e32 v27, 0 -; GFX9-NEXT: v_mov_b32_e32 v28, 0 -; GFX9-NEXT: v_mov_b32_e32 v29, 0 -; GFX9-NEXT: v_mov_b32_e32 v30, 0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_byval_and_stack_passed: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NEXT: v_mov_b32_e32 v22, 0 +; GCN-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NEXT: v_mov_b32_e32 v25, 0 +; GCN-NEXT: v_mov_b32_e32 v26, 0 +; GCN-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NEXT: v_mov_b32_e32 v29, 0 +; GCN-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca = alloca [3 x i32], align 16, addrspace(5) store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca @@ -1720,29 +970,13 @@ entry: declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0) define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { -; FIJI-LABEL: sibling_call_i64_fastcc_i64: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_i64_fastcc_i64: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_i64_fastcc_i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_i64_fastcc_i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a) ret i64 %ret @@ -1751,29 +985,13 @@ entry: declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { -; FIJI-LABEL: sibling_call_p1i8_fastcc_p1i8: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_p1i8_fastcc_p1i8: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_p1i8_fastcc_p1i8: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -1782,29 +1000,13 @@ entry: declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0) define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { -; FIJI-LABEL: sibling_call_i16_fastcc_i16: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_i16_fastcc_i16: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_i16_fastcc_i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_i16_fastcc_i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a) ret i16 %ret @@ -1813,29 +1015,13 @@ entry: declare hidden fastcc half @f16_fastcc_f16(half %arg0) define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { -; FIJI-LABEL: sibling_call_f16_fastcc_f16: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_f16_fastcc_f16: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_f16_fastcc_f16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_f16_fastcc_f16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc half @f16_fastcc_f16(half %a) ret half %ret @@ -1844,29 +1030,13 @@ entry: declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0) define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 { -; FIJI-LABEL: sibling_call_v3i16_fastcc_v3i16: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_v3i16_fastcc_v3i16: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a) ret <3 x i16> %ret @@ -1875,29 +1045,13 @@ entry: declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0) define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 { -; FIJI-LABEL: sibling_call_v4i16_fastcc_v4i16: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_v4i16_fastcc_v4i16: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_v4i16_fastcc_v4i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a) ret <4 x i16> %ret @@ -1906,29 +1060,13 @@ entry: declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0) define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 { -; FIJI-LABEL: sibling_call_v2i64_fastcc_v2i64: -; FIJI: ; %bb.0: ; %entry -; FIJI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIJI-NEXT: s_getpc_b64 s[16:17] -; FIJI-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 -; FIJI-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 -; FIJI-NEXT: s_setpc_b64 s[16:17] -; -; HAWAII-LABEL: sibling_call_v2i64_fastcc_v2i64: -; HAWAII: ; %bb.0: ; %entry -; HAWAII-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; HAWAII-NEXT: s_getpc_b64 s[16:17] -; HAWAII-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 -; HAWAII-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 -; HAWAII-NEXT: s_setpc_b64 s[16:17] -; -; GFX9-LABEL: sibling_call_v2i64_fastcc_v2i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 -; GFX9-NEXT: s_setpc_b64 s[16:17] +; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12 +; GCN-NEXT: s_setpc_b64 s[16:17] entry: %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a) ret <2 x i64> %ret