; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; Make sure the correct frame offset is used with the local
; frame area.
;
; %pin.low is allocated to offset 0.
;
; %local.area is assigned to the local frame offset by the
; LocalStackSlotAllocation pass at offset 4096.
;
; The %load1 access to %gep.small.offset initially used the stack
; pointer register and directly referenced the frame index. After
; LocalStackSlotAllocation, it would no longer refer to a frame index
; so eliminateFrameIndex would not adjust the access to use the
; correct FP offset.
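;
; A quick arithmetic note on the immediates in the checks, derived
; from the IR below: the memset length is 1060 * 8 = 8480 bytes
; (0x2120), and %gep.large.offset is element 1050, i.e. 1050 * 8 =
; 8400 bytes (0x20d0) from the start of %local.area.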

define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) {
; GCN-LABEL: local_stack_offset_uses_sp:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: v_mov_b32_e32 v1, 0x3000
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_add_u32_e32 v0, 64, v1
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0x2000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT: BB0_1: ; %loadstoreloop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_u32_e32 v3, s6, v1
; GCN-NEXT: s_add_i32 s6, s6, 1
; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120
; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
; GCN-NEXT: s_cbranch_scc1 BB0_1
; GCN-NEXT: ; %bb.2: ; %split
; GCN-NEXT: v_mov_b32_e32 v1, 0x3000
; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1
; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], s32 offen
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], s32 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GCN-NEXT: s_endpgm
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %pin.low
  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, i64 addrspace(1)* %out
  ret void
}

define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) {
; GCN-LABEL: func_local_stack_offset_uses_sp:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s32, 0x7ffc0
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000
; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33
; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_add_u32_e32 v2, 64, v3
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_add_u32 s32, s32, 0x180000
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33
; GCN-NEXT: BB1_1: ; %loadstoreloop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_u32_e32 v5, s4, v3
; GCN-NEXT: s_add_i32 s4, s4, 1
; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120
; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen
; GCN-NEXT: s_cbranch_scc1 BB1_1
; GCN-NEXT: ; %bb.2: ; %split
; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33
; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3
; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3
; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], s32 offen
; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], s32 offen offset:4
; GCN-NEXT: s_sub_u32 s32, s32, 0x180000
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %pin.low
  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, i64 addrspace(1)* %out
  ret void
}

declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #0

attributes #0 = { argmemonly nounwind willreturn writeonly }