@@ -207,3 +207,52 @@ define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
207207 %v = load i32 , ptr %gep
208208 ret i32 %v
209209}
210+
211+ @arr = global [10 x [10 x i64 ]] zeroinitializer ; 10x10 array of i64 (100 elements), zero-initialized
212+ define i64 @lshl_add_u64_gep_shift (i64 %row , i64 %col ) {
213+ ; GCN-LABEL: lshl_add_u64_gep_shift:
214+ ; GCN: ; %bb.0: ; %entry
215+ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216+ ; GCN-NEXT: s_getpc_b64 s[0:1]
217+ ; GCN-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
218+ ; GCN-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
219+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
220+ ; GCN-NEXT: s_movk_i32 s2, 0x50
221+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
222+ ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
223+ ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
224+ ; GCN-NEXT: v_mov_b32_e32 v0, v5
225+ ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[0:1]
226+ ; GCN-NEXT: v_mov_b32_e32 v5, v0
227+ ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[4:5]
228+ ; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
229+ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
230+ ; GCN-NEXT: s_setpc_b64 s[30:31]
231+ ;
232+ ; GI-LABEL: lshl_add_u64_gep_shift:
233+ ; GI: ; %bb.0: ; %entry
234+ ; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235+ ; GI-NEXT: s_getpc_b64 s[0:1]
236+ ; GI-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
237+ ; GI-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
238+ ; GI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
239+ ; GI-NEXT: v_mov_b32_e32 v6, 0x50
240+ ; GI-NEXT: v_mad_u64_u32 v[4:5], s[2:3], v0, v6, 0
241+ ; GI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v1, v6, 0
242+ ; GI-NEXT: v_add_u32_e32 v5, v5, v0
243+ ; GI-NEXT: s_waitcnt lgkmcnt(0)
244+ ; GI-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
245+ ; GI-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
246+ ; GI-NEXT: s_nop 1
247+ ; GI-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
248+ ; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
249+ ; GI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
250+ ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
251+ ; GI-NEXT: s_setpc_b64 s[30:31]
252+ entry:
253+ %base = getelementptr [10 x [10 x i64 ]], ptr @arr , i64 0 , i64 %row , i64 0 ; %base = &arr[%row][0]; each row is 10 x i64 = 80 (0x50) bytes
254+ %shifted_col = shl i64 %col , 2 ; byte offset = %col * 4 (shift left by 2). NOTE(review): this is not sizeof(i64), which is 8 (a shift by 3) - the CHECK lines expect the shift amount 2
255+ %ptr = getelementptr i8 , ptr %base , i64 %shifted_col ; i8 GEP: adds %shifted_col as a raw byte offset to %base
256+ %val = load i64 , ptr %ptr ; load an i64 from the computed address
257+ ret i64 %val
258+ }
0 commit comments