1- ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
2- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
1+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX9
3+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GFX10
34
; LDS (addrspace(3)) arrays of 64 floats each, 16-byte aligned. The test
; kernels pass these as the destination pointer of the *.load.lds intrinsics.
45@lds.0 = internal addrspace (3 ) global [64 x float ] poison, align 16
56@lds.1 = internal addrspace (3 ) global [64 x float ] poison, align 16
; Intrinsics exercised by the test kernels. Each takes a source (a <4 x i32>
; buffer resource, or an addrspace(1) global pointer), an addrspace(3)
; destination pointer, a size, offset operand(s) and an aux operand, and
; returns void.
1516declare void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) nocapture , i32 %size , i32 %voffset , i32 %soffset , i32 %offset , i32 %aux )
1617declare void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) nocapture %gptr , ptr addrspace (3 ) nocapture %lptr , i32 %size , i32 %offset , i32 %aux )
1718
18- ; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
19- ; GCN-COUNT-4: buffer_load_dword
20- ; GCN: s_waitcnt vmcnt(2)
21- ; GCN: ds_read_b32
22- ; GCN: s_waitcnt vmcnt(0)
23- ; GCN: ds_read_b32
2419define amdgpu_kernel void @buffer_load_lds_dword_2_arrays (<4 x i32 > %rsrc , i32 %i1 , i32 %i2 , ptr addrspace (1 ) %out ) {
20+ ; GFX9-LABEL: buffer_load_lds_dword_2_arrays:
21+ ; GFX9: ; %bb.0: ; %main_body
22+ ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
23+ ; GFX9-NEXT: v_mov_b32_e32 v0, 4
24+ ; GFX9-NEXT: s_mov_b32 m0, 0
25+ ; GFX9-NEXT: v_mov_b32_e32 v1, 8
26+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0
27+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
28+ ; GFX9-NEXT: buffer_load_dword off, s[8:11], 0 lds
29+ ; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
30+ ; GFX9-NEXT: s_movk_i32 m0, 0x100
31+ ; GFX9-NEXT: v_mov_b32_e32 v0, 12
32+ ; GFX9-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
33+ ; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
34+ ; GFX9-NEXT: s_lshl_b32 s0, s12, 2
35+ ; GFX9-NEXT: s_lshl_b32 s1, s13, 2
36+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
37+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
38+ ; GFX9-NEXT: s_waitcnt vmcnt(2)
39+ ; GFX9-NEXT: ds_read_b32 v0, v0
40+ ; GFX9-NEXT: ; wave barrier
41+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
42+ ; GFX9-NEXT: ds_read_b32 v1, v1 offset:256
43+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
44+ ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
45+ ; GFX9-NEXT: s_endpgm
46+ ;
47+ ; GFX10-LABEL: buffer_load_lds_dword_2_arrays:
48+ ; GFX10: ; %bb.0: ; %main_body
49+ ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
50+ ; GFX10-NEXT: v_mov_b32_e32 v0, 4
51+ ; GFX10-NEXT: v_mov_b32_e32 v1, 8
52+ ; GFX10-NEXT: v_mov_b32_e32 v2, 12
53+ ; GFX10-NEXT: s_mov_b32 m0, 0
54+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
55+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
56+ ; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
57+ ; GFX10-NEXT: s_movk_i32 m0, 0x100
58+ ; GFX10-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
59+ ; GFX10-NEXT: buffer_load_dword v2, s[0:3], 0 offen lds
60+ ; GFX10-NEXT: s_lshl_b32 s0, s4, 2
61+ ; GFX10-NEXT: s_lshl_b32 s1, s5, 2
62+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
63+ ; GFX10-NEXT: v_mov_b32_e32 v1, s1
64+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
65+ ; GFX10-NEXT: s_waitcnt vmcnt(2)
66+ ; GFX10-NEXT: ds_read_b32 v0, v0
67+ ; GFX10-NEXT: ; wave barrier
68+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
69+ ; GFX10-NEXT: ds_read_b32 v1, v1 offset:256
70+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
71+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
72+ ; GFX10-NEXT: s_endpgm
2573main_body:
2674 call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.0 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
2775 call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.0 , i32 4 , i32 4 , i32 0 , i32 0 , i32 0 )
@@ -41,15 +89,56 @@ main_body:
4189; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
4290; waitcnt and the target can report early completion, then we need to force a waitcnt 0.
4391
44- ; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
45- ; GCN-COUNT-4: global_load_dword
46- ; GFX9: s_waitcnt vmcnt(0)
47- ; GFX9-COUNT-2: ds_read_b32
48- ; GFX10: s_waitcnt vmcnt(2)
49- ; GFX10: ds_read_b32
50- ; GFX10: s_waitcnt vmcnt(0)
51- ; GFX10: ds_read_b32
5292define amdgpu_kernel void @global_load_lds_dword_2_arrays (ptr addrspace (1 ) nocapture %gptr , i32 %i1 , i32 %i2 , ptr addrspace (1 ) %out ) {
93+ ; GFX9-LABEL: global_load_lds_dword_2_arrays:
94+ ; GFX9: ; %bb.0: ; %main_body
95+ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
96+ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
97+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0
98+ ; GFX9-NEXT: s_mov_b32 m0, 0
99+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
100+ ; GFX9-NEXT: global_load_dword v2, s[0:1] lds
101+ ; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds
102+ ; GFX9-NEXT: s_movk_i32 m0, 0x100
103+ ; GFX9-NEXT: s_nop 0
104+ ; GFX9-NEXT: global_load_dword v2, s[0:1] offset:8 lds
105+ ; GFX9-NEXT: global_load_dword v2, s[0:1] offset:12 lds
106+ ; GFX9-NEXT: s_lshl_b32 s0, s2, 2
107+ ; GFX9-NEXT: s_lshl_b32 s1, s3, 2
108+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
109+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
110+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
111+ ; GFX9-NEXT: ds_read_b32 v0, v0
112+ ; GFX9-NEXT: ; wave barrier
113+ ; GFX9-NEXT: ds_read_b32 v1, v1 offset:256
114+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
115+ ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
116+ ; GFX9-NEXT: s_endpgm
117+ ;
118+ ; GFX10-LABEL: global_load_lds_dword_2_arrays:
119+ ; GFX10: ; %bb.0: ; %main_body
120+ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
121+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
122+ ; GFX10-NEXT: s_mov_b32 m0, 0
123+ ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
124+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
125+ ; GFX10-NEXT: global_load_dword v2, s[0:1] lds
126+ ; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds
127+ ; GFX10-NEXT: s_movk_i32 m0, 0x100
128+ ; GFX10-NEXT: global_load_dword v2, s[0:1] offset:8 lds
129+ ; GFX10-NEXT: global_load_dword v2, s[0:1] offset:12 lds
130+ ; GFX10-NEXT: s_lshl_b32 s0, s2, 2
131+ ; GFX10-NEXT: s_lshl_b32 s1, s3, 2
132+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
133+ ; GFX10-NEXT: v_mov_b32_e32 v1, s1
134+ ; GFX10-NEXT: s_waitcnt vmcnt(2)
135+ ; GFX10-NEXT: ds_read_b32 v0, v0
136+ ; GFX10-NEXT: ; wave barrier
137+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
138+ ; GFX10-NEXT: ds_read_b32 v1, v1 offset:256
139+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
140+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
141+ ; GFX10-NEXT: s_endpgm
53142main_body:
54143 call void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) %gptr , ptr addrspace (3 ) @lds.0 , i32 4 , i32 0 , i32 0 )
55144 call void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) %gptr , ptr addrspace (3 ) @lds.0 , i32 4 , i32 4 , i32 0 )
@@ -68,25 +157,144 @@ main_body:
68157
69158; There are 8 pseudo registers defined to track LDS DMA dependencies.
70159
71- ; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
72- ; GCN-COUNT-10: buffer_load_dword
73- ; GCN: s_waitcnt vmcnt(8)
74- ; GCN: ds_read_b32
75- ; GCN: s_waitcnt vmcnt(7)
76- ; GCN: ds_read_b32
77- ; GCN: s_waitcnt vmcnt(6)
78- ; GCN: ds_read_b32
79- ; GCN: s_waitcnt vmcnt(5)
80- ; GCN: ds_read_b32
81- ; GCN: s_waitcnt vmcnt(4)
82- ; GCN: ds_read_b32
83- ; GCN: s_waitcnt vmcnt(3)
84- ; GCN: ds_read_b32
85- ; GCN: s_waitcnt vmcnt(2)
86- ; GCN-NOT: s_waitcnt vmcnt
87- ; GCN: ds_read_b32
88- ; GCN: ds_read_b32
89160define amdgpu_kernel void @buffer_load_lds_dword_10_arrays (<4 x i32 > %rsrc , i32 %i1 , i32 %i2 , i32 %i3 , i32 %i4 , i32 %i5 , i32 %i6 , i32 %i7 , i32 %i8 , i32 %i9 , ptr addrspace (1 ) %out ) {
161+ ; GFX9-LABEL: buffer_load_lds_dword_10_arrays:
162+ ; GFX9: ; %bb.0: ; %main_body
163+ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
164+ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
165+ ; GFX9-NEXT: s_mov_b32 m0, 0
166+ ; GFX9-NEXT: v_mov_b32_e32 v10, 0
167+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
168+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
169+ ; GFX9-NEXT: s_movk_i32 m0, 0x100
170+ ; GFX9-NEXT: s_nop 0
171+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
172+ ; GFX9-NEXT: s_movk_i32 m0, 0x200
173+ ; GFX9-NEXT: s_nop 0
174+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
175+ ; GFX9-NEXT: s_movk_i32 m0, 0x300
176+ ; GFX9-NEXT: s_nop 0
177+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
178+ ; GFX9-NEXT: s_movk_i32 m0, 0x400
179+ ; GFX9-NEXT: s_nop 0
180+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
181+ ; GFX9-NEXT: s_movk_i32 m0, 0x500
182+ ; GFX9-NEXT: s_nop 0
183+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
184+ ; GFX9-NEXT: s_movk_i32 m0, 0x600
185+ ; GFX9-NEXT: s_nop 0
186+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
187+ ; GFX9-NEXT: s_movk_i32 m0, 0x700
188+ ; GFX9-NEXT: s_nop 0
189+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
190+ ; GFX9-NEXT: s_movk_i32 m0, 0x800
191+ ; GFX9-NEXT: s_nop 0
192+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
193+ ; GFX9-NEXT: s_movk_i32 m0, 0x900
194+ ; GFX9-NEXT: s_nop 0
195+ ; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
196+ ; GFX9-NEXT: s_lshl_b32 s2, s6, 2
197+ ; GFX9-NEXT: s_lshl_b32 s3, s7, 2
198+ ; GFX9-NEXT: v_mov_b32_e32 v0, s2
199+ ; GFX9-NEXT: v_mov_b32_e32 v9, s3
200+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c
201+ ; GFX9-NEXT: s_waitcnt vmcnt(9)
202+ ; GFX9-NEXT: ds_read_b32 v0, v0
203+ ; GFX9-NEXT: ; wave barrier
204+ ; GFX9-NEXT: s_waitcnt vmcnt(8)
205+ ; GFX9-NEXT: ds_read_b32 v1, v9 offset:256
206+ ; GFX9-NEXT: ; wave barrier
207+ ; GFX9-NEXT: s_waitcnt vmcnt(7)
208+ ; GFX9-NEXT: ds_read_b32 v2, v9 offset:512
209+ ; GFX9-NEXT: ; wave barrier
210+ ; GFX9-NEXT: s_waitcnt vmcnt(6)
211+ ; GFX9-NEXT: ds_read_b32 v3, v9 offset:768
212+ ; GFX9-NEXT: ; wave barrier
213+ ; GFX9-NEXT: s_waitcnt vmcnt(5)
214+ ; GFX9-NEXT: ds_read_b32 v4, v9 offset:1024
215+ ; GFX9-NEXT: ; wave barrier
216+ ; GFX9-NEXT: s_waitcnt vmcnt(4)
217+ ; GFX9-NEXT: ds_read_b32 v5, v9 offset:1280
218+ ; GFX9-NEXT: ; wave barrier
219+ ; GFX9-NEXT: s_waitcnt vmcnt(3)
220+ ; GFX9-NEXT: ds_read_b32 v6, v9 offset:1536
221+ ; GFX9-NEXT: ; wave barrier
222+ ; GFX9-NEXT: s_waitcnt vmcnt(2)
223+ ; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792
224+ ; GFX9-NEXT: ; wave barrier
225+ ; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048
226+ ; GFX9-NEXT: ; wave barrier
227+ ; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304
228+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
229+ ; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
230+ ; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
231+ ; GFX9-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
232+ ; GFX9-NEXT: s_endpgm
233+ ;
234+ ; GFX10-LABEL: buffer_load_lds_dword_10_arrays:
235+ ; GFX10: ; %bb.0: ; %main_body
236+ ; GFX10-NEXT: s_clause 0x1
237+ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
238+ ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
239+ ; GFX10-NEXT: s_mov_b32 m0, 0
240+ ; GFX10-NEXT: v_mov_b32_e32 v10, 0
241+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
242+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
243+ ; GFX10-NEXT: s_movk_i32 m0, 0x100
244+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
245+ ; GFX10-NEXT: s_movk_i32 m0, 0x200
246+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
247+ ; GFX10-NEXT: s_movk_i32 m0, 0x300
248+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
249+ ; GFX10-NEXT: s_movk_i32 m0, 0x400
250+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
251+ ; GFX10-NEXT: s_movk_i32 m0, 0x500
252+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
253+ ; GFX10-NEXT: s_movk_i32 m0, 0x600
254+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
255+ ; GFX10-NEXT: s_movk_i32 m0, 0x700
256+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
257+ ; GFX10-NEXT: s_movk_i32 m0, 0x800
258+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
259+ ; GFX10-NEXT: s_movk_i32 m0, 0x900
260+ ; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
261+ ; GFX10-NEXT: s_lshl_b32 s0, s6, 2
262+ ; GFX10-NEXT: s_lshl_b32 s1, s7, 2
263+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
264+ ; GFX10-NEXT: v_mov_b32_e32 v9, s1
265+ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c
266+ ; GFX10-NEXT: s_waitcnt vmcnt(9)
267+ ; GFX10-NEXT: ds_read_b32 v0, v0
268+ ; GFX10-NEXT: ; wave barrier
269+ ; GFX10-NEXT: s_waitcnt vmcnt(8)
270+ ; GFX10-NEXT: ds_read_b32 v1, v9 offset:256
271+ ; GFX10-NEXT: ; wave barrier
272+ ; GFX10-NEXT: s_waitcnt vmcnt(7)
273+ ; GFX10-NEXT: ds_read_b32 v2, v9 offset:512
274+ ; GFX10-NEXT: ; wave barrier
275+ ; GFX10-NEXT: s_waitcnt vmcnt(6)
276+ ; GFX10-NEXT: ds_read_b32 v3, v9 offset:768
277+ ; GFX10-NEXT: ; wave barrier
278+ ; GFX10-NEXT: s_waitcnt vmcnt(5)
279+ ; GFX10-NEXT: ds_read_b32 v4, v9 offset:1024
280+ ; GFX10-NEXT: ; wave barrier
281+ ; GFX10-NEXT: s_waitcnt vmcnt(4)
282+ ; GFX10-NEXT: ds_read_b32 v5, v9 offset:1280
283+ ; GFX10-NEXT: ; wave barrier
284+ ; GFX10-NEXT: s_waitcnt vmcnt(3)
285+ ; GFX10-NEXT: ds_read_b32 v6, v9 offset:1536
286+ ; GFX10-NEXT: ; wave barrier
287+ ; GFX10-NEXT: s_waitcnt vmcnt(2)
288+ ; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792
289+ ; GFX10-NEXT: ; wave barrier
290+ ; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048
291+ ; GFX10-NEXT: ; wave barrier
292+ ; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304
293+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
294+ ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
295+ ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
296+ ; GFX10-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
297+ ; GFX10-NEXT: s_endpgm
90298main_body:
91299 call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.0 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
92300 call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.1 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
@@ -151,14 +359,49 @@ main_body:
151359
152360define amdgpu_kernel void @global_load_lds_no_alias_ds_read (ptr addrspace (1 ) nocapture %gptr , i32 %i1 , i32 %i2 , ptr addrspace (1 ) %out ) {
153361; GFX9-LABEL: global_load_lds_no_alias_ds_read:
154- ; GFX9: global_load_dword
155- ; GFX9: global_load_dword
156- ; GFX9: s_waitcnt vmcnt(1)
157- ; GFX9-NOT: s_waitcnt vmcnt(0)
158- ; GFX9: ds_read_b32
159- ; GFX9: s_waitcnt vmcnt(0)
160- ; GFX9: ds_read_b32
161- ; GFX9: s_endpgm
362+ ; GFX9: ; %bb.0: ; %body
363+ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
364+ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
365+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0
366+ ; GFX9-NEXT: s_mov_b32 m0, 0
367+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
368+ ; GFX9-NEXT: global_load_dword v2, s[0:1] lds
369+ ; GFX9-NEXT: s_movk_i32 m0, 0x100
370+ ; GFX9-NEXT: s_nop 0
371+ ; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds
372+ ; GFX9-NEXT: s_lshl_b32 s0, s2, 2
373+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
374+ ; GFX9-NEXT: s_lshl_b32 s0, s3, 2
375+ ; GFX9-NEXT: v_mov_b32_e32 v1, s0
376+ ; GFX9-NEXT: s_waitcnt vmcnt(1)
377+ ; GFX9-NEXT: ds_read_b32 v0, v0 offset:512
378+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
379+ ; GFX9-NEXT: ds_read_b32 v1, v1 offset:768
380+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
381+ ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
382+ ; GFX9-NEXT: s_endpgm
383+ ;
384+ ; GFX10-LABEL: global_load_lds_no_alias_ds_read:
385+ ; GFX10: ; %bb.0: ; %body
386+ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
387+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
388+ ; GFX10-NEXT: s_mov_b32 m0, 0
389+ ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
390+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
391+ ; GFX10-NEXT: global_load_dword v2, s[0:1] lds
392+ ; GFX10-NEXT: s_movk_i32 m0, 0x100
393+ ; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds
394+ ; GFX10-NEXT: s_lshl_b32 s0, s2, 2
395+ ; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
396+ ; GFX10-NEXT: v_mov_b32_e32 v0, s0
397+ ; GFX10-NEXT: s_lshl_b32 s0, s3, 2
398+ ; GFX10-NEXT: v_mov_b32_e32 v1, s0
399+ ; GFX10-NEXT: ds_read_b32 v0, v0 offset:512
400+ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
401+ ; GFX10-NEXT: ds_read_b32 v1, v1 offset:768
402+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
403+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
404+ ; GFX10-NEXT: s_endpgm
162405body:
163406 call void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) %gptr , ptr addrspace (3 ) @lds.0 , i32 4 , i32 0 , i32 0 )
164407 call void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) %gptr , ptr addrspace (3 ) @lds.1 , i32 4 , i32 4 , i32 0 )
0 commit comments