1- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
2- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
1+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
34
4- ; GCN-LABEL: {{^}}trunc_i64_bitcast_v2i32:
5- ; GCN: buffer_load_dword v
6- ; GCN: buffer_store_dword v
; NOTE(review): this span is a mangled diff view — the leading digits / "+" on each
; line are fused diff line numbers, not part of the test; restore from upstream
; before committing. Test intent (per the autogenerated SI/VI bodies below):
; trunc(bitcast <2 x i32> -> i64) to i32 shrinks to one buffer_load_dword +
; one buffer_store_dword on both targets — no 64-bit load survives.
75define amdgpu_kernel void @trunc_i64_bitcast_v2i32 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
6+ ; SI-LABEL: trunc_i64_bitcast_v2i32:
7+ ; SI: ; %bb.0:
8+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
9+ ; SI-NEXT: s_mov_b32 s7, 0xf000
10+ ; SI-NEXT: s_mov_b32 s6, -1
11+ ; SI-NEXT: s_mov_b32 s10, s6
12+ ; SI-NEXT: s_mov_b32 s11, s7
13+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
14+ ; SI-NEXT: s_mov_b32 s8, s2
15+ ; SI-NEXT: s_mov_b32 s9, s3
16+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
17+ ; SI-NEXT: s_mov_b32 s4, s0
18+ ; SI-NEXT: s_mov_b32 s5, s1
19+ ; SI-NEXT: s_waitcnt vmcnt(0)
20+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
21+ ; SI-NEXT: s_endpgm
22+ ;
23+ ; VI-LABEL: trunc_i64_bitcast_v2i32:
24+ ; VI: ; %bb.0:
25+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
26+ ; VI-NEXT: s_mov_b32 s7, 0xf000
27+ ; VI-NEXT: s_mov_b32 s6, -1
28+ ; VI-NEXT: s_mov_b32 s10, s6
29+ ; VI-NEXT: s_mov_b32 s11, s7
30+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
31+ ; VI-NEXT: s_mov_b32 s8, s2
32+ ; VI-NEXT: s_mov_b32 s9, s3
33+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
34+ ; VI-NEXT: s_mov_b32 s4, s0
35+ ; VI-NEXT: s_mov_b32 s5, s1
36+ ; VI-NEXT: s_waitcnt vmcnt(0)
37+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
38+ ; VI-NEXT: s_endpgm
; IR body: load the vector, bitcast it to a wide integer, truncate to the
; low 32 bits, store. Only the first element is ever observable.
839 %ld = load <2 x i32 >, ptr addrspace (1 ) %in
940 %bc = bitcast <2 x i32 > %ld to i64
1041 %trunc = trunc i64 %bc to i32
1142 store i32 %trunc , ptr addrspace (1 ) %out
1243 ret void
1344}
1445
15- ; GCN-LABEL: {{^}}trunc_i96_bitcast_v3i32:
16- ; GCN: buffer_load_dword v
17- ; GCN: buffer_store_dword v
; NOTE(review): garbled diff view — leading digits / "+" are diff artifacts, not test
; content. Test intent: trunc(bitcast <3 x i32> -> i96) to i32 shrinks the <3 x i32>
; load to a single buffer_load_dword and stores one dword on both SI and VI.
1846define amdgpu_kernel void @trunc_i96_bitcast_v3i32 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
47+ ; SI-LABEL: trunc_i96_bitcast_v3i32:
48+ ; SI: ; %bb.0:
49+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
50+ ; SI-NEXT: s_mov_b32 s7, 0xf000
51+ ; SI-NEXT: s_mov_b32 s6, -1
52+ ; SI-NEXT: s_mov_b32 s10, s6
53+ ; SI-NEXT: s_mov_b32 s11, s7
54+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
55+ ; SI-NEXT: s_mov_b32 s8, s2
56+ ; SI-NEXT: s_mov_b32 s9, s3
57+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
58+ ; SI-NEXT: s_mov_b32 s4, s0
59+ ; SI-NEXT: s_mov_b32 s5, s1
60+ ; SI-NEXT: s_waitcnt vmcnt(0)
61+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
62+ ; SI-NEXT: s_endpgm
63+ ;
64+ ; VI-LABEL: trunc_i96_bitcast_v3i32:
65+ ; VI: ; %bb.0:
66+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
67+ ; VI-NEXT: s_mov_b32 s7, 0xf000
68+ ; VI-NEXT: s_mov_b32 s6, -1
69+ ; VI-NEXT: s_mov_b32 s10, s6
70+ ; VI-NEXT: s_mov_b32 s11, s7
71+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
72+ ; VI-NEXT: s_mov_b32 s8, s2
73+ ; VI-NEXT: s_mov_b32 s9, s3
74+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
75+ ; VI-NEXT: s_mov_b32 s4, s0
76+ ; VI-NEXT: s_mov_b32 s5, s1
77+ ; VI-NEXT: s_waitcnt vmcnt(0)
78+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
79+ ; VI-NEXT: s_endpgm
; IR body: only the low 32 bits of the i96 bitcast are stored.
1980 %ld = load <3 x i32 >, ptr addrspace (1 ) %in
2081 %bc = bitcast <3 x i32 > %ld to i96
2182 %trunc = trunc i96 %bc to i32
2283 store i32 %trunc , ptr addrspace (1 ) %out
2384 ret void
2485}
2586
26- ; GCN-LABEL: {{^}}trunc_i128_bitcast_v4i32:
27- ; GCN: buffer_load_dword v
28- ; GCN: buffer_store_dword v
2987define amdgpu_kernel void @trunc_i128_bitcast_v4i32 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
88+ ; SI-LABEL: trunc_i128_bitcast_v4i32:
89+ ; SI: ; %bb.0:
90+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
91+ ; SI-NEXT: s_mov_b32 s7, 0xf000
92+ ; SI-NEXT: s_mov_b32 s6, -1
93+ ; SI-NEXT: s_mov_b32 s10, s6
94+ ; SI-NEXT: s_mov_b32 s11, s7
95+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
96+ ; SI-NEXT: s_mov_b32 s8, s2
97+ ; SI-NEXT: s_mov_b32 s9, s3
98+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
99+ ; SI-NEXT: s_mov_b32 s4, s0
100+ ; SI-NEXT: s_mov_b32 s5, s1
101+ ; SI-NEXT: s_waitcnt vmcnt(0)
102+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
103+ ; SI-NEXT: s_endpgm
104+ ;
105+ ; VI-LABEL: trunc_i128_bitcast_v4i32:
106+ ; VI: ; %bb.0:
107+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
108+ ; VI-NEXT: s_mov_b32 s7, 0xf000
109+ ; VI-NEXT: s_mov_b32 s6, -1
110+ ; VI-NEXT: s_mov_b32 s10, s6
111+ ; VI-NEXT: s_mov_b32 s11, s7
112+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
113+ ; VI-NEXT: s_mov_b32 s8, s2
114+ ; VI-NEXT: s_mov_b32 s9, s3
115+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
116+ ; VI-NEXT: s_mov_b32 s4, s0
117+ ; VI-NEXT: s_mov_b32 s5, s1
118+ ; VI-NEXT: s_waitcnt vmcnt(0)
119+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
120+ ; VI-NEXT: s_endpgm
30121 %ld = load <4 x i32 >, ptr addrspace (1 ) %in
31122 %bc = bitcast <4 x i32 > %ld to i128
32123 %trunc = trunc i128 %bc to i32
@@ -35,25 +126,85 @@ define amdgpu_kernel void @trunc_i128_bitcast_v4i32(ptr addrspace(1) %out, ptr a
35126}
36127
37128; Don't want load width reduced in this case.
38- ; GCN-LABEL: {{^}}trunc_i16_bitcast_v2i16:
39- ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
40- ; GCN: buffer_store_short [[VAL]]
; NOTE(review): garbled diff view — leading digits / "+" are diff artifacts, not test
; content. Test intent (see "Don't want load width reduced" note above): the
; <2 x i16> load stays a full buffer_load_dword; only the store is narrowed to
; buffer_store_short, on both SI and VI.
41129define amdgpu_kernel void @trunc_i16_bitcast_v2i16 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
130+ ; SI-LABEL: trunc_i16_bitcast_v2i16:
131+ ; SI: ; %bb.0:
132+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
133+ ; SI-NEXT: s_mov_b32 s7, 0xf000
134+ ; SI-NEXT: s_mov_b32 s6, -1
135+ ; SI-NEXT: s_mov_b32 s10, s6
136+ ; SI-NEXT: s_mov_b32 s11, s7
137+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
138+ ; SI-NEXT: s_mov_b32 s8, s2
139+ ; SI-NEXT: s_mov_b32 s9, s3
140+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
141+ ; SI-NEXT: s_mov_b32 s4, s0
142+ ; SI-NEXT: s_mov_b32 s5, s1
143+ ; SI-NEXT: s_waitcnt vmcnt(0)
144+ ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
145+ ; SI-NEXT: s_endpgm
146+ ;
147+ ; VI-LABEL: trunc_i16_bitcast_v2i16:
148+ ; VI: ; %bb.0:
149+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
150+ ; VI-NEXT: s_mov_b32 s7, 0xf000
151+ ; VI-NEXT: s_mov_b32 s6, -1
152+ ; VI-NEXT: s_mov_b32 s10, s6
153+ ; VI-NEXT: s_mov_b32 s11, s7
154+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
155+ ; VI-NEXT: s_mov_b32 s8, s2
156+ ; VI-NEXT: s_mov_b32 s9, s3
157+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
158+ ; VI-NEXT: s_mov_b32 s4, s0
159+ ; VI-NEXT: s_mov_b32 s5, s1
160+ ; VI-NEXT: s_waitcnt vmcnt(0)
161+ ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
162+ ; VI-NEXT: s_endpgm
; IR body: store the low i16 of the i32 bitcast of the loaded <2 x i16>.
42163 %ld = load <2 x i16 >, ptr addrspace (1 ) %in
43164 %bc = bitcast <2 x i16 > %ld to i32
44165 %trunc = trunc i32 %bc to i16
45166 store i16 %trunc , ptr addrspace (1 ) %out
46167 ret void
47168}
48169
49- ; GCN-LABEL: {{^}}trunc_i16_bitcast_v4i16:
50170; FIXME We need to teach the dagcombiner to reduce load width for:
51171; t21: v2i32,ch = load<LD8[%in(addrspace=1)]> t12, t10, undef:i64
52172; t23: i64 = bitcast t21
53173; t30: i16 = truncate t23
54- ; GCN: buffer_load_dword v[[VAL:[0-9]+]]
55- ; GCN: buffer_store_short v[[VAL]], off
56174define amdgpu_kernel void @trunc_i16_bitcast_v4i16 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
175+ ; SI-LABEL: trunc_i16_bitcast_v4i16:
176+ ; SI: ; %bb.0:
177+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
178+ ; SI-NEXT: s_mov_b32 s7, 0xf000
179+ ; SI-NEXT: s_mov_b32 s6, -1
180+ ; SI-NEXT: s_mov_b32 s10, s6
181+ ; SI-NEXT: s_mov_b32 s11, s7
182+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
183+ ; SI-NEXT: s_mov_b32 s8, s2
184+ ; SI-NEXT: s_mov_b32 s9, s3
185+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
186+ ; SI-NEXT: s_mov_b32 s4, s0
187+ ; SI-NEXT: s_mov_b32 s5, s1
188+ ; SI-NEXT: s_waitcnt vmcnt(0)
189+ ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
190+ ; SI-NEXT: s_endpgm
191+ ;
192+ ; VI-LABEL: trunc_i16_bitcast_v4i16:
193+ ; VI: ; %bb.0:
194+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
195+ ; VI-NEXT: s_mov_b32 s7, 0xf000
196+ ; VI-NEXT: s_mov_b32 s6, -1
197+ ; VI-NEXT: s_mov_b32 s10, s6
198+ ; VI-NEXT: s_mov_b32 s11, s7
199+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
200+ ; VI-NEXT: s_mov_b32 s8, s2
201+ ; VI-NEXT: s_mov_b32 s9, s3
202+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
203+ ; VI-NEXT: s_mov_b32 s4, s0
204+ ; VI-NEXT: s_mov_b32 s5, s1
205+ ; VI-NEXT: s_waitcnt vmcnt(0)
206+ ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
207+ ; VI-NEXT: s_endpgm
57208 %ld = load <4 x i16 >, ptr addrspace (1 ) %in
58209 %bc = bitcast <4 x i16 > %ld to i64
59210 %trunc = trunc i64 %bc to i16
@@ -62,33 +213,122 @@ define amdgpu_kernel void @trunc_i16_bitcast_v4i16(ptr addrspace(1) %out, ptr ad
62213}
63214
64215; FIXME: Consistently shrink or not here
65- ; GCN-LABEL: {{^}}trunc_i8_bitcast_v2i8:
66- ; SI: buffer_load_ubyte [[VAL:v[0-9]+]]
67- ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
68- ; GCN: buffer_store_byte [[VAL]]
; NOTE(review): garbled diff view — leading digits / "+" are diff artifacts, not test
; content. Test intent (see the "Consistently shrink or not" FIXME above): the two
; targets currently disagree — SI shrinks the <2 x i8> load to buffer_load_ubyte
; while VI keeps a buffer_load_ushort; both narrow the store to buffer_store_byte.
69216define amdgpu_kernel void @trunc_i8_bitcast_v2i8 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
217+ ; SI-LABEL: trunc_i8_bitcast_v2i8:
218+ ; SI: ; %bb.0:
219+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
220+ ; SI-NEXT: s_mov_b32 s7, 0xf000
221+ ; SI-NEXT: s_mov_b32 s6, -1
222+ ; SI-NEXT: s_mov_b32 s10, s6
223+ ; SI-NEXT: s_mov_b32 s11, s7
224+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
225+ ; SI-NEXT: s_mov_b32 s8, s2
226+ ; SI-NEXT: s_mov_b32 s9, s3
227+ ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
228+ ; SI-NEXT: s_mov_b32 s4, s0
229+ ; SI-NEXT: s_mov_b32 s5, s1
230+ ; SI-NEXT: s_waitcnt vmcnt(0)
231+ ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
232+ ; SI-NEXT: s_endpgm
233+ ;
234+ ; VI-LABEL: trunc_i8_bitcast_v2i8:
235+ ; VI: ; %bb.0:
236+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
237+ ; VI-NEXT: s_mov_b32 s7, 0xf000
238+ ; VI-NEXT: s_mov_b32 s6, -1
239+ ; VI-NEXT: s_mov_b32 s10, s6
240+ ; VI-NEXT: s_mov_b32 s11, s7
241+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
242+ ; VI-NEXT: s_mov_b32 s8, s2
243+ ; VI-NEXT: s_mov_b32 s9, s3
244+ ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
245+ ; VI-NEXT: s_mov_b32 s4, s0
246+ ; VI-NEXT: s_mov_b32 s5, s1
247+ ; VI-NEXT: s_waitcnt vmcnt(0)
248+ ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
249+ ; VI-NEXT: s_endpgm
; IR body: store the low i8 of the i16 bitcast of the loaded <2 x i8>.
70250 %ld = load <2 x i8 >, ptr addrspace (1 ) %in
71251 %bc = bitcast <2 x i8 > %ld to i16
72252 %trunc = trunc i16 %bc to i8
73253 store i8 %trunc , ptr addrspace (1 ) %out
74254 ret void
75255}
76256
77- ; GCN-LABEL: {{^}}trunc_i32_bitcast_v4i8:
78- ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
79- ; GCN: buffer_store_byte [[VAL]]
; NOTE(review): garbled diff view — leading digits / "+" are diff artifacts, not test
; content. Test intent: trunc(bitcast <4 x i8> -> i32) to i8 keeps the full
; buffer_load_dword (load width not reduced) and narrows only the store to
; buffer_store_byte, on both SI and VI.
80257define amdgpu_kernel void @trunc_i32_bitcast_v4i8 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
258+ ; SI-LABEL: trunc_i32_bitcast_v4i8:
259+ ; SI: ; %bb.0:
260+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
261+ ; SI-NEXT: s_mov_b32 s7, 0xf000
262+ ; SI-NEXT: s_mov_b32 s6, -1
263+ ; SI-NEXT: s_mov_b32 s10, s6
264+ ; SI-NEXT: s_mov_b32 s11, s7
265+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
266+ ; SI-NEXT: s_mov_b32 s8, s2
267+ ; SI-NEXT: s_mov_b32 s9, s3
268+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
269+ ; SI-NEXT: s_mov_b32 s4, s0
270+ ; SI-NEXT: s_mov_b32 s5, s1
271+ ; SI-NEXT: s_waitcnt vmcnt(0)
272+ ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
273+ ; SI-NEXT: s_endpgm
274+ ;
275+ ; VI-LABEL: trunc_i32_bitcast_v4i8:
276+ ; VI: ; %bb.0:
277+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
278+ ; VI-NEXT: s_mov_b32 s7, 0xf000
279+ ; VI-NEXT: s_mov_b32 s6, -1
280+ ; VI-NEXT: s_mov_b32 s10, s6
281+ ; VI-NEXT: s_mov_b32 s11, s7
282+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
283+ ; VI-NEXT: s_mov_b32 s8, s2
284+ ; VI-NEXT: s_mov_b32 s9, s3
285+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
286+ ; VI-NEXT: s_mov_b32 s4, s0
287+ ; VI-NEXT: s_mov_b32 s5, s1
288+ ; VI-NEXT: s_waitcnt vmcnt(0)
289+ ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
290+ ; VI-NEXT: s_endpgm
; IR body: store the low i8 of the i32 bitcast of the loaded <4 x i8>.
81291 %ld = load <4 x i8 >, ptr addrspace (1 ) %in
82292 %bc = bitcast <4 x i8 > %ld to i32
83293 %trunc = trunc i32 %bc to i8
84294 store i8 %trunc , ptr addrspace (1 ) %out
85295 ret void
86296}
87297
88- ; GCN-LABEL: {{^}}trunc_i24_bitcast_v3i8:
89- ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
90- ; GCN: buffer_store_byte [[VAL]]
91298define amdgpu_kernel void @trunc_i24_bitcast_v3i8 (ptr addrspace (1 ) %out , ptr addrspace (1 ) %in ) {
299+ ; SI-LABEL: trunc_i24_bitcast_v3i8:
300+ ; SI: ; %bb.0:
301+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
302+ ; SI-NEXT: s_mov_b32 s7, 0xf000
303+ ; SI-NEXT: s_mov_b32 s6, -1
304+ ; SI-NEXT: s_mov_b32 s10, s6
305+ ; SI-NEXT: s_mov_b32 s11, s7
306+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
307+ ; SI-NEXT: s_mov_b32 s8, s2
308+ ; SI-NEXT: s_mov_b32 s9, s3
309+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
310+ ; SI-NEXT: s_mov_b32 s4, s0
311+ ; SI-NEXT: s_mov_b32 s5, s1
312+ ; SI-NEXT: s_waitcnt vmcnt(0)
313+ ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
314+ ; SI-NEXT: s_endpgm
315+ ;
316+ ; VI-LABEL: trunc_i24_bitcast_v3i8:
317+ ; VI: ; %bb.0:
318+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
319+ ; VI-NEXT: s_mov_b32 s7, 0xf000
320+ ; VI-NEXT: s_mov_b32 s6, -1
321+ ; VI-NEXT: s_mov_b32 s10, s6
322+ ; VI-NEXT: s_mov_b32 s11, s7
323+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
324+ ; VI-NEXT: s_mov_b32 s8, s2
325+ ; VI-NEXT: s_mov_b32 s9, s3
326+ ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
327+ ; VI-NEXT: s_mov_b32 s4, s0
328+ ; VI-NEXT: s_mov_b32 s5, s1
329+ ; VI-NEXT: s_waitcnt vmcnt(0)
330+ ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
331+ ; VI-NEXT: s_endpgm
92332 %ld = load <3 x i8 >, ptr addrspace (1 ) %in
93333 %bc = bitcast <3 x i8 > %ld to i24
94334 %trunc = trunc i24 %bc to i8
0 commit comments