@@ -93,6 +93,326 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
9393 ret void
9494}
9595
; Covers the f32 overload of llvm.amdgcn.set.inactive: the kernel passes %in
; and the constant 3.0 to the intrinsic and stores the result through %out.
; The expected code copies %in into v0, then overwrites v0 with 0x40400000
; (the IEEE-754 single-precision bit pattern of 3.0, materialized in v1)
; between two "s_not_b64 exec, exec" instructions, i.e. while the exec mask
; is inverted, and finally stores v0 with buffer_store_dword.
96+ define amdgpu_kernel void @set_inactive_f32 (ptr addrspace (1 ) %out , float %in ) {
97+ ; GCN-LABEL: set_inactive_f32:
98+ ; GCN: ; %bb.0:
99+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
100+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
101+ ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
102+ ; GCN-NEXT: s_mov_b32 s2, -1
103+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
104+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
105+ ; GCN-NEXT: s_not_b64 exec, exec
106+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
107+ ; GCN-NEXT: s_not_b64 exec, exec
108+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
109+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
110+ ; GCN-NEXT: s_endpgm
111+ %tmp = call float @llvm.amdgcn.set.inactive.f32 (float %in , float 3 .0 ) #0
112+ store float %tmp , ptr addrspace (1 ) %out
113+ ret void
114+ }
115+
; Covers the f64 overload of llvm.amdgcn.set.inactive: the kernel passes %in
; and the constant 4.2 to the intrinsic and stores the result through %out.
; The 64-bit value is handled as two dwords: %in is copied into v[0:1], and
; the constant halves 0xcccccccd / 0x4010cccc (staged through s4/s5 and
; v2/v3) are moved into v0/v1 between two "s_not_b64 exec, exec"
; instructions. The result is written with buffer_store_dwordx2.
116+ define amdgpu_kernel void @set_inactive_f64 (ptr addrspace (1 ) %out , double %in ) {
117+ ; GCN-LABEL: set_inactive_f64:
118+ ; GCN: ; %bb.0:
119+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
120+ ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
121+ ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
122+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
123+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
124+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
125+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
126+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
127+ ; GCN-NEXT: s_not_b64 exec, exec
128+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
129+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
130+ ; GCN-NEXT: s_not_b64 exec, exec
131+ ; GCN-NEXT: s_mov_b32 s2, -1
132+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
133+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
134+ ; GCN-NEXT: s_endpgm
135+ %tmp = call double @llvm.amdgcn.set.inactive.f64 (double %in , double 4 .2 ) #0
136+ store double %tmp , ptr addrspace (1 ) %out
137+ ret void
138+ }
139+
; Covers the <2 x i16> overload of llvm.amdgcn.set.inactive with the constant
; vector <1, 1>; the result is stored through %out.
; The vector is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with 0x10001 (both i16 lanes equal to 1, materialized in v1)
; between two "s_not_b64 exec, exec" instructions, and stored with
; buffer_store_dword.
140+ define amdgpu_kernel void @set_inactive_v2i16 (ptr addrspace (1 ) %out , <2 x i16 > %in ) {
141+ ; GCN-LABEL: set_inactive_v2i16:
142+ ; GCN: ; %bb.0:
143+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
144+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
145+ ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
146+ ; GCN-NEXT: s_mov_b32 s2, -1
147+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
148+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
149+ ; GCN-NEXT: s_not_b64 exec, exec
150+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
151+ ; GCN-NEXT: s_not_b64 exec, exec
152+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
153+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
154+ ; GCN-NEXT: s_endpgm
155+ %tmp = call <2 x i16 > @llvm.amdgcn.set.inactive.v2i16 (<2 x i16 > %in , <2 x i16 > <i16 1 , i16 1 >) #0
156+ store <2 x i16 > %tmp , ptr addrspace (1 ) %out
157+ ret void
158+ }
159+
; Covers the <2 x half> overload of llvm.amdgcn.set.inactive with the constant
; vector <1.0, 1.0>; the result is stored through %out.
; The vector is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with 0x3c003c00 (0x3c00 is the IEEE-754 half-precision bit
; pattern of 1.0, repeated in both lanes) between two "s_not_b64 exec, exec"
; instructions, and stored with buffer_store_dword.
160+ define amdgpu_kernel void @set_inactive_v2f16 (ptr addrspace (1 ) %out , <2 x half > %in ) {
161+ ; GCN-LABEL: set_inactive_v2f16:
162+ ; GCN: ; %bb.0:
163+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
164+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
165+ ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
166+ ; GCN-NEXT: s_mov_b32 s2, -1
167+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
168+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
169+ ; GCN-NEXT: s_not_b64 exec, exec
170+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
171+ ; GCN-NEXT: s_not_b64 exec, exec
172+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
173+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
174+ ; GCN-NEXT: s_endpgm
175+ %tmp = call <2 x half > @llvm.amdgcn.set.inactive.v2f16 (<2 x half > %in , <2 x half > <half 1 .0 , half 1 .0 >) #0
176+ store <2 x half > %tmp , ptr addrspace (1 ) %out
177+ ret void
178+ }
179+
; Covers the <2 x i32> overload of llvm.amdgcn.set.inactive with the constant
; vector <1, 1>; the result is stored through %out.
; The vector is handled as two dwords: %in is copied into v[0:1], and the
; constant 1 (set in s4, duplicated into s5, staged in v2/v3) is moved into
; v0/v1 between two "s_not_b64 exec, exec" instructions. The result is
; written with buffer_store_dwordx2.
180+ define amdgpu_kernel void @set_inactive_v2i32 (ptr addrspace (1 ) %out , <2 x i32 > %in ) {
181+ ; GCN-LABEL: set_inactive_v2i32:
182+ ; GCN: ; %bb.0:
183+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
184+ ; GCN-NEXT: s_mov_b32 s4, 1
185+ ; GCN-NEXT: s_mov_b32 s5, s4
186+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
187+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
188+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
189+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
190+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
191+ ; GCN-NEXT: s_not_b64 exec, exec
192+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
193+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
194+ ; GCN-NEXT: s_not_b64 exec, exec
195+ ; GCN-NEXT: s_mov_b32 s2, -1
196+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
197+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
198+ ; GCN-NEXT: s_endpgm
199+ %tmp = call <2 x i32 > @llvm.amdgcn.set.inactive.v2i32 (<2 x i32 > %in , <2 x i32 > <i32 1 , i32 1 >) #0
200+ store <2 x i32 > %tmp , ptr addrspace (1 ) %out
201+ ret void
202+ }
203+
; Covers the <2 x float> overload of llvm.amdgcn.set.inactive with the
; constant vector <1.0, 1.0>; the result is stored through %out.
; The vector is handled as two dwords: %in is copied into v[0:1], and the
; constant 1.0 (set in s4, duplicated into s5, staged in v2/v3) is moved into
; v0/v1 between two "s_not_b64 exec, exec" instructions. The result is
; written with buffer_store_dwordx2.
204+ define amdgpu_kernel void @set_inactive_v2f32 (ptr addrspace (1 ) %out , <2 x float > %in ) {
205+ ; GCN-LABEL: set_inactive_v2f32:
206+ ; GCN: ; %bb.0:
207+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
208+ ; GCN-NEXT: s_mov_b32 s4, 1.0
209+ ; GCN-NEXT: s_mov_b32 s5, s4
210+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
211+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
212+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
213+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
214+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
215+ ; GCN-NEXT: s_not_b64 exec, exec
216+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
217+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
218+ ; GCN-NEXT: s_not_b64 exec, exec
219+ ; GCN-NEXT: s_mov_b32 s2, -1
220+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
221+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
222+ ; GCN-NEXT: s_endpgm
223+ %tmp = call <2 x float > @llvm.amdgcn.set.inactive.v2f32 (<2 x float > %in , <2 x float > <float 1 .0 , float 1 .0 >) #0
224+ store <2 x float > %tmp , ptr addrspace (1 ) %out
225+ ret void
226+ }
227+
; Covers the <2 x bfloat> overload of llvm.amdgcn.set.inactive with the
; constant vector <1.0, 1.0>; the result is stored through %out.
; The vector is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with 0x3f803f80 (0x3f80 is the bfloat16 bit pattern of 1.0,
; repeated in both lanes) between two "s_not_b64 exec, exec" instructions,
; and stored with buffer_store_dword.
228+ define amdgpu_kernel void @set_inactive_v2bf16 (ptr addrspace (1 ) %out , <2 x bfloat> %in ) {
229+ ; GCN-LABEL: set_inactive_v2bf16:
230+ ; GCN: ; %bb.0:
231+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
232+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
233+ ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
234+ ; GCN-NEXT: s_mov_b32 s2, -1
235+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
236+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
237+ ; GCN-NEXT: s_not_b64 exec, exec
238+ ; GCN-NEXT: v_mov_b32_e32 v0, v1
239+ ; GCN-NEXT: s_not_b64 exec, exec
240+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
241+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
242+ ; GCN-NEXT: s_endpgm
243+ %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16 (<2 x bfloat> %in , <2 x bfloat> <bfloat 1 .0 , bfloat 1 .0 >) #0
244+ store <2 x bfloat> %tmp , ptr addrspace (1 ) %out
245+ ret void
246+ }
247+
; Covers the <4 x i16> overload of llvm.amdgcn.set.inactive with the constant
; vector <1, 1, 1, 1>; the result is stored through %out.
; The vector is handled as two dwords: %in is copied into v[0:1], and the
; packed constant 0x10001 (set in s4, duplicated into s5, staged in v2/v3) is
; moved into v0/v1 between two "s_not_b64 exec, exec" instructions. The
; result is written with buffer_store_dwordx2.
248+ define amdgpu_kernel void @set_inactive_v4i16 (ptr addrspace (1 ) %out , <4 x i16 > %in ) {
249+ ; GCN-LABEL: set_inactive_v4i16:
250+ ; GCN: ; %bb.0:
251+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
252+ ; GCN-NEXT: s_mov_b32 s4, 0x10001
253+ ; GCN-NEXT: s_mov_b32 s5, s4
254+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
255+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
256+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
257+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
258+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
259+ ; GCN-NEXT: s_not_b64 exec, exec
260+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
261+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
262+ ; GCN-NEXT: s_not_b64 exec, exec
263+ ; GCN-NEXT: s_mov_b32 s2, -1
264+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
265+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
266+ ; GCN-NEXT: s_endpgm
267+ %tmp = call <4 x i16 > @llvm.amdgcn.set.inactive.v4i16 (<4 x i16 > %in , <4 x i16 > <i16 1 , i16 1 , i16 1 , i16 1 >) #0
268+ store <4 x i16 > %tmp , ptr addrspace (1 ) %out
269+ ret void
270+ }
271+
; Covers the <4 x half> overload of llvm.amdgcn.set.inactive with the constant
; vector <1.0, 1.0, 1.0, 1.0>; the result is stored through %out.
; The vector is handled as two dwords: %in is copied into v[0:1], and the
; packed constant 0x3c003c00 (two half-precision 1.0 values; set in s4,
; duplicated into s5, staged in v2/v3) is moved into v0/v1 between two
; "s_not_b64 exec, exec" instructions. The result is written with
; buffer_store_dwordx2.
272+ define amdgpu_kernel void @set_inactive_v4f16 (ptr addrspace (1 ) %out , <4 x half > %in ) {
273+ ; GCN-LABEL: set_inactive_v4f16:
274+ ; GCN: ; %bb.0:
275+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
276+ ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
277+ ; GCN-NEXT: s_mov_b32 s5, s4
278+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
279+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
280+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
281+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
282+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
283+ ; GCN-NEXT: s_not_b64 exec, exec
284+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
285+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
286+ ; GCN-NEXT: s_not_b64 exec, exec
287+ ; GCN-NEXT: s_mov_b32 s2, -1
288+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
289+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
290+ ; GCN-NEXT: s_endpgm
291+ %tmp = call <4 x half > @llvm.amdgcn.set.inactive.v4f16 (<4 x half > %in , <4 x half > <half 1 .0 , half 1 .0 , half 1 .0 , half 1 .0 >) #0
292+ store <4 x half > %tmp , ptr addrspace (1 ) %out
293+ ret void
294+ }
295+
; Covers the <4 x bfloat> overload of llvm.amdgcn.set.inactive with the
; constant vector <1.0, 1.0, 1.0, 1.0>; the result is stored through %out.
; The vector is handled as two dwords: %in is copied into v[0:1], and the
; packed constant 0x3f803f80 (two bfloat16 1.0 values; set in s4, duplicated
; into s5, staged in v2/v3) is moved into v0/v1 between two
; "s_not_b64 exec, exec" instructions. The result is written with
; buffer_store_dwordx2.
296+ define amdgpu_kernel void @set_inactive_v4bf16 (ptr addrspace (1 ) %out , <4 x bfloat> %in ) {
297+ ; GCN-LABEL: set_inactive_v4bf16:
298+ ; GCN: ; %bb.0:
299+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
300+ ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
301+ ; GCN-NEXT: s_mov_b32 s5, s4
302+ ; GCN-NEXT: v_mov_b32_e32 v2, s4
303+ ; GCN-NEXT: v_mov_b32_e32 v3, s5
304+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
305+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
306+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
307+ ; GCN-NEXT: s_not_b64 exec, exec
308+ ; GCN-NEXT: v_mov_b32_e32 v0, v2
309+ ; GCN-NEXT: v_mov_b32_e32 v1, v3
310+ ; GCN-NEXT: s_not_b64 exec, exec
311+ ; GCN-NEXT: s_mov_b32 s2, -1
312+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
313+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
314+ ; GCN-NEXT: s_endpgm
315+ %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16 (<4 x bfloat> %in , <4 x bfloat> <bfloat 1 .0 , bfloat 1 .0 , bfloat 1 .0 , bfloat 1 .0 >) #0
316+ store <4 x bfloat> %tmp , ptr addrspace (1 ) %out
317+ ret void
318+ }
319+
; Covers the default-address-space pointer (p0) overload of
; llvm.amdgcn.set.inactive with a null second operand; the result is stored
; through %out.
; The pointer is handled as two dwords: %in is copied into v[0:1], then both
; halves are overwritten with the inline constant 0 between two
; "s_not_b64 exec, exec" instructions. The result is written with
; buffer_store_dwordx2.
320+ define amdgpu_kernel void @set_inactive_p0 (ptr addrspace (1 ) %out , ptr %in ) {
321+ ; GCN-LABEL: set_inactive_p0:
322+ ; GCN: ; %bb.0:
323+ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
324+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
325+ ; GCN-NEXT: v_mov_b32_e32 v0, s2
326+ ; GCN-NEXT: v_mov_b32_e32 v1, s3
327+ ; GCN-NEXT: s_not_b64 exec, exec
328+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
329+ ; GCN-NEXT: v_mov_b32_e32 v1, 0
330+ ; GCN-NEXT: s_not_b64 exec, exec
331+ ; GCN-NEXT: s_mov_b32 s2, -1
332+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
333+ ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
334+ ; GCN-NEXT: s_endpgm
335+ %tmp = call ptr @llvm.amdgcn.set.inactive.p0 (ptr %in , ptr null ) #0
336+ store ptr %tmp , ptr addrspace (1 ) %out
337+ ret void
338+ }
339+
; Covers the addrspace(2) pointer (p2) overload of llvm.amdgcn.set.inactive
; with a null second operand; the result is stored through %out.
; The pointer is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with the inline constant 0 between two "s_not_b64 exec, exec"
; instructions, and stored with buffer_store_dword.
340+ define amdgpu_kernel void @set_inactive_p2 (ptr addrspace (1 ) %out , ptr addrspace (2 ) %in ) {
341+ ; GCN-LABEL: set_inactive_p2:
342+ ; GCN: ; %bb.0:
343+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
344+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
345+ ; GCN-NEXT: s_mov_b32 s2, -1
346+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
347+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
348+ ; GCN-NEXT: s_not_b64 exec, exec
349+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
350+ ; GCN-NEXT: s_not_b64 exec, exec
351+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
352+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
353+ ; GCN-NEXT: s_endpgm
354+ %tmp = call ptr addrspace (2 ) @llvm.amdgcn.set.inactive.p2 (ptr addrspace (2 ) %in , ptr addrspace (2 ) null ) #0
355+ store ptr addrspace (2 ) %tmp , ptr addrspace (1 ) %out
356+ ret void
357+ }
358+
; Covers the addrspace(3) pointer (p3) overload of llvm.amdgcn.set.inactive
; with a null second operand; the result is stored through %out.
; The pointer is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with the inline constant 0 between two "s_not_b64 exec, exec"
; instructions, and stored with buffer_store_dword.
359+ define amdgpu_kernel void @set_inactive_p3 (ptr addrspace (1 ) %out , ptr addrspace (3 ) %in ) {
360+ ; GCN-LABEL: set_inactive_p3:
361+ ; GCN: ; %bb.0:
362+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
363+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
364+ ; GCN-NEXT: s_mov_b32 s2, -1
365+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
366+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
367+ ; GCN-NEXT: s_not_b64 exec, exec
368+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
369+ ; GCN-NEXT: s_not_b64 exec, exec
370+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
371+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
372+ ; GCN-NEXT: s_endpgm
373+ %tmp = call ptr addrspace (3 ) @llvm.amdgcn.set.inactive.p3 (ptr addrspace (3 ) %in , ptr addrspace (3 ) null ) #0
374+ store ptr addrspace (3 ) %tmp , ptr addrspace (1 ) %out
375+ ret void
376+ }
377+
; Covers the addrspace(5) pointer (p5) overload of llvm.amdgcn.set.inactive
; with a null second operand; the result is stored through %out.
; The pointer is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with the inline constant 0 between two "s_not_b64 exec, exec"
; instructions, and stored with buffer_store_dword.
378+ define amdgpu_kernel void @set_inactive_p5 (ptr addrspace (1 ) %out , ptr addrspace (5 ) %in ) {
379+ ; GCN-LABEL: set_inactive_p5:
380+ ; GCN: ; %bb.0:
381+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
382+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
383+ ; GCN-NEXT: s_mov_b32 s2, -1
384+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
385+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
386+ ; GCN-NEXT: s_not_b64 exec, exec
387+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
388+ ; GCN-NEXT: s_not_b64 exec, exec
389+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
390+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
391+ ; GCN-NEXT: s_endpgm
392+ %tmp = call ptr addrspace (5 ) @llvm.amdgcn.set.inactive.p5 (ptr addrspace (5 ) %in , ptr addrspace (5 ) null ) #0
393+ store ptr addrspace (5 ) %tmp , ptr addrspace (1 ) %out
394+ ret void
395+ }
396+
; Covers the addrspace(6) pointer (p6) overload of llvm.amdgcn.set.inactive
; with a null second operand; the result is stored through %out.
; The pointer is handled as a single dword: %in is copied into v0, then v0 is
; overwritten with the inline constant 0 between two "s_not_b64 exec, exec"
; instructions, and stored with buffer_store_dword.
397+ define amdgpu_kernel void @set_inactive_p6 (ptr addrspace (1 ) %out , ptr addrspace (6 ) %in ) {
398+ ; GCN-LABEL: set_inactive_p6:
399+ ; GCN: ; %bb.0:
400+ ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
401+ ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
402+ ; GCN-NEXT: s_mov_b32 s2, -1
403+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
404+ ; GCN-NEXT: v_mov_b32_e32 v0, s3
405+ ; GCN-NEXT: s_not_b64 exec, exec
406+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
407+ ; GCN-NEXT: s_not_b64 exec, exec
408+ ; GCN-NEXT: s_mov_b32 s3, 0xf000
409+ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
410+ ; GCN-NEXT: s_endpgm
411+ %tmp = call ptr addrspace (6 ) @llvm.amdgcn.set.inactive.p6 (ptr addrspace (6 ) %in , ptr addrspace (6 ) null ) #0
412+ store ptr addrspace (6 ) %tmp , ptr addrspace (1 ) %out
413+ ret void
414+ }
415+
96416declare i32 @llvm.amdgcn.set.inactive.i32 (i32 , i32 ) #0
97417declare i64 @llvm.amdgcn.set.inactive.i64 (i64 , i64 ) #0
98418declare i32 @llvm.amdgcn.s.buffer.load.i32 (<4 x i32 >, i32 , i32 )
0 commit comments