|  | 
|  | 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | 2 | +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s | 
|  | 3 | +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s | 
|  | 4 | + | 
|  | 5 | +; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG | 
|  | 6 | +; opcode. The RUN lines use -disable-separate-const-offset-from-gep to disable | 
|  | 7 | +; similar transformations in that pass. | 
|  | 8 | + | 
|  | 9 | +; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use. | 
|  |  | +; Here p is %base, c1 is 24 and z is %voffset. %voffset is used twice: as the | 
|  |  | +; index of %gep1 and as an operand of the final add. In the autogenerated checks | 
|  |  | +; the PTRADD run adds the constant 24 with a separate v_lshl_add_u64, while the | 
|  |  | +; legacy run folds it into the load's immediate (offset:24). | 
|  | 10 | +define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { | 
|  | 11 | +; GFX942_PTRADD-LABEL: global_load_ZTwoUses: | 
|  | 12 | +; GFX942_PTRADD:       ; %bb.0: | 
|  | 13 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 14 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 24 | 
|  | 15 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] | 
|  | 16 | +; GFX942_PTRADD-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
|  | 17 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) | 
|  | 18 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] | 
|  | 19 | +; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31] | 
|  | 20 | +; | 
|  | 21 | +; GFX942_LEGACY-LABEL: global_load_ZTwoUses: | 
|  | 22 | +; GFX942_LEGACY:       ; %bb.0: | 
|  | 23 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 24 | +; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] | 
|  | 25 | +; GFX942_LEGACY-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:24 | 
|  | 26 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) | 
|  | 27 | +; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] | 
|  | 28 | +; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31] | 
|  | 29 | +  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24 | 
|  | 30 | +  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset | 
|  | 31 | +  %l = load i64, ptr addrspace(1) %gep1, align 8 | 
|  | 32 | +  %r = add i64 %l, %voffset | 
|  | 33 | +  ret i64 %r | 
|  | 34 | +} | 
|  | 35 | + | 
|  |  | +; Tests a GEP whose index is itself an add with a constant: | 
|  |  | +; (gep nuw inbounds %base, (add nuw nsw %voffset, 24)). In the autogenerated | 
|  |  | +; checks the legacy run moves the constant into the load's immediate | 
|  |  | +; (offset:24), while the PTRADD run first adds 24 to the offset with a separate | 
|  |  | +; v_lshl_add_u64. | 
|  | 36 | +define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { | 
|  | 37 | +; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: | 
|  | 38 | +; GFX942_PTRADD:       ; %bb.0: | 
|  | 39 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 40 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, 24 | 
|  | 41 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] | 
|  | 42 | +; GFX942_PTRADD-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off | 
|  | 43 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) | 
|  | 44 | +; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31] | 
|  | 45 | +; | 
|  | 46 | +; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: | 
|  | 47 | +; GFX942_LEGACY:       ; %bb.0: | 
|  | 48 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 49 | +; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] | 
|  | 50 | +; GFX942_LEGACY-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:24 | 
|  | 51 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) | 
|  | 52 | +; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31] | 
|  | 53 | +  %add0 = add nuw nsw i64 %voffset, 24 | 
|  | 54 | +  %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 | 
|  | 55 | +  %l = load i64, ptr addrspace(1) %gep0, align 8 | 
|  | 56 | +  ret i64 %l | 
|  | 57 | +} | 
|  | 58 | + | 
|  | 59 | +; Tests reassociation (ptradd (ptradd p, c1), c2) with two constants. These | 
|  | 60 | +; would be folded away in most cases, but the index computation introduced by | 
|  | 61 | +; the legalization of wide vector stores can for example introduce them. | 
|  |  | +; The single <16 x i32> store is emitted as four global_store_dwordx4 | 
|  |  | +; instructions. In the autogenerated checks the PTRADD run addresses the last | 
|  |  | +; 16-byte chunk as (base + 32) via s_add_u32/s_addc_u32 plus offset:16, while | 
|  |  | +; the legacy run uses the base pointer directly with offset:48. | 
|  | 62 | +define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { | 
|  | 63 | +; GFX942_PTRADD-LABEL: store_v16i32: | 
|  | 64 | +; GFX942_PTRADD:       ; %bb.0: ; %entry | 
|  | 65 | +; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
|  | 66 | +; GFX942_PTRADD-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40 | 
|  | 67 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v4, 0 | 
|  | 68 | +; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | 69 | +; GFX942_PTRADD-NEXT:    s_add_u32 s2, s0, 32 | 
|  | 70 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s20 | 
|  | 71 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s21 | 
|  | 72 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s22 | 
|  | 73 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s23 | 
|  | 74 | +; GFX942_PTRADD-NEXT:    s_addc_u32 s3, s1, 0 | 
|  | 75 | +; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 | 
|  | 76 | +; GFX942_PTRADD-NEXT:    s_nop 1 | 
|  | 77 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s16 | 
|  | 78 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s17 | 
|  | 79 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s18 | 
|  | 80 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s19 | 
|  | 81 | +; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 | 
|  | 82 | +; GFX942_PTRADD-NEXT:    s_nop 1 | 
|  | 83 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s12 | 
|  | 84 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s13 | 
|  | 85 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s14 | 
|  | 86 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s15 | 
|  | 87 | +; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 | 
|  | 88 | +; GFX942_PTRADD-NEXT:    s_nop 1 | 
|  | 89 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v0, s8 | 
|  | 90 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v1, s9 | 
|  | 91 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, s10 | 
|  | 92 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v3, s11 | 
|  | 93 | +; GFX942_PTRADD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] | 
|  | 94 | +; GFX942_PTRADD-NEXT:    s_endpgm | 
|  | 95 | +; | 
|  | 96 | +; GFX942_LEGACY-LABEL: store_v16i32: | 
|  | 97 | +; GFX942_LEGACY:       ; %bb.0: ; %entry | 
|  | 98 | +; GFX942_LEGACY-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40 | 
|  | 99 | +; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
|  | 100 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v4, 0 | 
|  | 101 | +; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | 102 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s20 | 
|  | 103 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s21 | 
|  | 104 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s22 | 
|  | 105 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s23 | 
|  | 106 | +; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 | 
|  | 107 | +; GFX942_LEGACY-NEXT:    s_nop 1 | 
|  | 108 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s16 | 
|  | 109 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s17 | 
|  | 110 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s18 | 
|  | 111 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s19 | 
|  | 112 | +; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 | 
|  | 113 | +; GFX942_LEGACY-NEXT:    s_nop 1 | 
|  | 114 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s12 | 
|  | 115 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s13 | 
|  | 116 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s14 | 
|  | 117 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s15 | 
|  | 118 | +; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 | 
|  | 119 | +; GFX942_LEGACY-NEXT:    s_nop 1 | 
|  | 120 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v0, s8 | 
|  | 121 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v1, s9 | 
|  | 122 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, s10 | 
|  | 123 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v3, s11 | 
|  | 124 | +; GFX942_LEGACY-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] | 
|  | 125 | +; GFX942_LEGACY-NEXT:    s_endpgm | 
|  | 126 | +entry: | 
|  | 127 | +  store <16 x i32> %a, ptr addrspace(1) %out | 
|  | 128 | +  ret void | 
|  | 129 | +} | 
|  | 130 | + | 
|  | 131 | + | 
|  | 132 | +; Tests the (ptradd 0, x) -> x DAG combine. | 
|  |  | +; The IR stores through a GEP whose base pointer is the constant null. In the | 
|  |  | +; autogenerated checks the legacy run stores through the offset registers | 
|  |  | +; directly, while the PTRADD run still contains an explicit add with 0 | 
|  |  | +; (v_lshl_add_u64 v[0:1], 0, 0, v[0:1]). | 
|  | 133 | +define void @baseptr_null(i64 %offset, i8 %v) { | 
|  | 134 | +; GFX942_PTRADD-LABEL: baseptr_null: | 
|  | 135 | +; GFX942_PTRADD:       ; %bb.0: | 
|  | 136 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 137 | +; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], 0, 0, v[0:1] | 
|  | 138 | +; GFX942_PTRADD-NEXT:    flat_store_byte v[0:1], v2 | 
|  | 139 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | 140 | +; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31] | 
|  | 141 | +; | 
|  | 142 | +; GFX942_LEGACY-LABEL: baseptr_null: | 
|  | 143 | +; GFX942_LEGACY:       ; %bb.0: | 
|  | 144 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | 
|  | 145 | +; GFX942_LEGACY-NEXT:    flat_store_byte v[0:1], v2 | 
|  | 146 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
|  | 147 | +; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31] | 
|  | 148 | +  %gep = getelementptr i8, ptr null, i64 %offset | 
|  | 149 | +  store i8 %v, ptr %gep, align 1 | 
|  | 150 | +  ret void | 
|  | 151 | +} | 
|  | 152 | + | 
|  | 153 | +; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the | 
|  | 154 | +; assertalign DAG combine. | 
|  |  | +; The kernel does three volatile i8 loads (queue, implicitarg and dispatch | 
|  |  | +; pointers) and one volatile store of the dispatch id. In the autogenerated | 
|  |  | +; checks the legacy run issues the second load as s[4:5] offset:8, while the | 
|  |  | +; PTRADD run computes s[4:5] + 8 into s[8:9] with s_add_u32/s_addc_u32 first. | 
|  |  | +; NOTE(review): attribute group #0 is referenced below, but no matching | 
|  |  | +; `attributes #0` definition is visible in this file - confirm it is intended. | 
|  | 155 | +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  #0 { | 
|  | 156 | +; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr: | 
|  | 157 | +; GFX942_PTRADD:       ; %bb.0: | 
|  | 158 | +; GFX942_PTRADD-NEXT:    s_add_u32 s8, s4, 8 | 
|  | 159 | +; GFX942_PTRADD-NEXT:    v_mov_b32_e32 v2, 0 | 
|  | 160 | +; GFX942_PTRADD-NEXT:    s_addc_u32 s9, s5, 0 | 
|  | 161 | +; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[2:3] sc0 sc1 | 
|  | 162 | +; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[8:9] sc0 sc1 | 
|  | 163 | +; GFX942_PTRADD-NEXT:    global_load_ubyte v0, v2, s[0:1] sc0 sc1 | 
|  | 164 | +; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr0_sgpr1 | 
|  | 165 | +; GFX942_PTRADD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
|  | 166 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) | 
|  | 167 | +; GFX942_PTRADD-NEXT:    v_mov_b64_e32 v[0:1], s[6:7] | 
|  | 168 | +; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr8 killed $sgpr9 | 
|  | 169 | +; GFX942_PTRADD-NEXT:    ; kill: killed $sgpr2_sgpr3 | 
|  | 170 | +; GFX942_PTRADD-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | 171 | +; GFX942_PTRADD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 | 
|  | 172 | +; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) | 
|  | 173 | +; GFX942_PTRADD-NEXT:    s_endpgm | 
|  | 174 | +; | 
|  | 175 | +; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr: | 
|  | 176 | +; GFX942_LEGACY:       ; %bb.0: | 
|  | 177 | +; GFX942_LEGACY-NEXT:    v_mov_b32_e32 v2, 0 | 
|  | 178 | +; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[2:3] sc0 sc1 | 
|  | 179 | +; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1 | 
|  | 180 | +; GFX942_LEGACY-NEXT:    global_load_ubyte v0, v2, s[0:1] sc0 sc1 | 
|  | 181 | +; GFX942_LEGACY-NEXT:    ; kill: killed $sgpr0_sgpr1 | 
|  | 182 | +; GFX942_LEGACY-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
|  | 183 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) | 
|  | 184 | +; GFX942_LEGACY-NEXT:    v_mov_b64_e32 v[0:1], s[6:7] | 
|  | 185 | +; GFX942_LEGACY-NEXT:    ; kill: killed $sgpr2_sgpr3 | 
|  | 186 | +; GFX942_LEGACY-NEXT:    s_waitcnt lgkmcnt(0) | 
|  | 187 | +; GFX942_LEGACY-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 | 
|  | 188 | +; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) | 
|  | 189 | +; GFX942_LEGACY-NEXT:    s_endpgm | 
|  | 190 | +  %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() | 
|  | 191 | +  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() | 
|  | 192 | +  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() | 
|  | 193 | +  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id() | 
|  | 194 | +  %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr | 
|  | 195 | +  %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr | 
|  | 196 | +  %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr | 
|  | 197 | +  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr | 
|  | 198 | +  ret void | 
|  | 199 | +} | 
|  | 200 | + | 
|  | 201 | +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: | 
|  | 202 | +; GFX942: {{.*}} | 
0 commit comments