; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.

- ; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
- ; to avoid gfx9 scheduling-induced issues.
-
-
- ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
- ; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
- ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
- ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
- ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
- ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
- ; GCN: v_cmp_eq_u32_e32
- ; GCN-COUNT-32: v_cndmask_b32
-
- ; GCN-COUNT-4: buffer_store_dwordx4
- define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
- entry:
-   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
-   %id.ext = zext i32 %id to i64
-   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
-   %idx0 = load volatile i32, ptr addrspace(1) %gep
-   %idx1 = add i32 %idx0, 1
-   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
-   %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
-   %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
-   store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
-   %cmp = icmp eq i32 %id, 0
-   br i1 %cmp, label %bb1, label %bb2
-
- bb1:
-   store volatile i32 %live.out.val, ptr addrspace(1) undef
-   br label %bb2
-
- bb2:
-   ret void
- }
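
The v_cmp_eq_u32 and GCN-COUNT-32 v_cndmask_b32 checks above pin down how the backend scalarizes a VGPR-indexed insertelement here: one compare-and-select per vector element, per insert (two inserts x 16 elements = 32 selects). A minimal IR sketch of that expansion, cut down to 4 lanes for brevity (the function name and lane count are illustrative, not part of the test):

; Hypothetical 4-lane reduction of the compare-and-select expansion.
define <4 x i32> @insert_dynamic_sketch(<4 x i32> %vec, i32 %val, i32 %idx) {
entry:
  ; Each lane becomes an icmp + select, which the backend emits as
  ; v_cmp_eq_u32 + v_cndmask_b32.
  %e0 = extractelement <4 x i32> %vec, i32 0
  %c0 = icmp eq i32 %idx, 0
  %s0 = select i1 %c0, i32 %val, i32 %e0
  %v0 = insertelement <4 x i32> %vec, i32 %s0, i32 0
  %e1 = extractelement <4 x i32> %v0, i32 1
  %c1 = icmp eq i32 %idx, 1
  %s1 = select i1 %c1, i32 %val, i32 %e1
  %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
  %e2 = extractelement <4 x i32> %v1, i32 2
  %c2 = icmp eq i32 %idx, 2
  %s2 = select i1 %c2, i32 %val, i32 %e2
  %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2
  %e3 = extractelement <4 x i32> %v2, i32 3
  %c3 = icmp eq i32 %idx, 3
  %s3 = select i1 %c3, i32 %val, i32 %e3
  %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3
  ret <4 x i32> %v3
}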
-
- ; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
- ; gpr_idx mode switching sequence is expanded late for this reason.
-
- ; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block
-
- ; GCN: s_set_gpr_idx_on
- ; GCN-NEXT: v_mov_b32_e32
- ; GCN-NEXT: s_set_gpr_idx_off
-
- ; GCN: s_set_gpr_idx_on
- ; GCN-NEXT: v_mov_b32_e32
- ; GCN-NOT: v_mov_b32_e32
- ; GCN-NEXT: s_set_gpr_idx_off
- define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 {
- entry:
-   %add1 = add i32 %in, 1
-   %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
-   %add2 = add i32 %in, 2
-   %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
-   store <16 x float> %ins1, ptr addrspace(1) %out1
-   %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
-   store <16 x float> %ins2, ptr addrspace(1) %out2
-
-   ret void
- }
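
For context on the s_set_gpr_idx checks above: s_set_gpr_idx_on enters a mode in which the indicated VGPR operand of subsequent VALU instructions is offset by the index held in M0, and s_set_gpr_idx_off leaves it, so a constant-offset insert needs exactly one v_mov_b32_e32 inside each window; the GCN-NOT line is what rejects a stray copy in the second window. A hedged sketch of the sequence the checks accept (register numbers are illustrative, not from the test):

s_set_gpr_idx_on s2, gpr_idx(DST)  ; destination VGPR number is offset by the M0 index
v_mov_b32_e32 v1, v17              ; the single indexed write: v[1+idx] gets v17
s_set_gpr_idx_off                  ; back to normal VGPR addressing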
-
declare hidden void @foo()

; For functions with calls, we were not accounting for m0_lo16/m0_hi16
@@ -83,7 +19,4 @@ define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %i
  ret void
}

- declare i32 @llvm.amdgcn.workitem.id.x() #1
- declare void @llvm.amdgcn.s.barrier() #2
-
attributes #0 = { nounwind }