;; Test that the carry-out from a 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
;; (i.e., no additional compare is generated).
44
5- ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
5+ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
66
77%0 = type { i64 , i64 , i32 , i32 }
88%1 = type { [64 x [8 x i64 ]] }
@@ -189,21 +189,19 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; Test SGPR (inreg) arguments
190190;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191191
192- define %struct.uint96 @s_add64_32 (i64 inreg %val64A , i64 inreg %val64B , i32 inreg %val32 ) {
192+ define amdgpu_ps %struct.uint96 @s_add64_32 (i64 inreg %val64A , i64 inreg %val64B , i32 inreg %val32 ) {
193193; CHECK-LABEL: s_add64_32:
194194; CHECK: ; %bb.0:
195- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196- ; CHECK-NEXT: s_add_u32 s4, s16, s18
197- ; CHECK-NEXT: v_mov_b32_e32 v0, s16
198- ; CHECK-NEXT: s_addc_u32 s5, s17, s19
199- ; CHECK-NEXT: v_mov_b32_e32 v1, s17
200- ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
201- ; CHECK-NEXT: v_mov_b32_e32 v0, s4
195+ ; CHECK-NEXT: s_add_u32 s6, s0, s2
196+ ; CHECK-NEXT: v_mov_b32_e32 v0, s0
197+ ; CHECK-NEXT: s_addc_u32 s7, s1, s3
198+ ; CHECK-NEXT: v_mov_b32_e32 v1, s1
199+ ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
200+ ; CHECK-NEXT: s_mov_b32 s0, s6
202201; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
203- ; CHECK-NEXT: s_addc_u32 s6, s20, 0
204- ; CHECK-NEXT: v_mov_b32_e32 v1, s5
205- ; CHECK-NEXT: v_mov_b32_e32 v2, s6
206- ; CHECK-NEXT: s_setpc_b64 s[30:31]
202+ ; CHECK-NEXT: s_addc_u32 s2, s4, 0
203+ ; CHECK-NEXT: s_mov_b32 s1, s7
204+ ; CHECK-NEXT: ; return to shader part epilog
207205 %sum64 = add i64 %val64A , %val64B
208206 %obit = icmp ult i64 %sum64 , %val64A
209207 %obit32 = zext i1 %obit to i32
@@ -213,32 +211,32 @@ define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inre
213211 ret %struct.uint96 %.fca.1.insert
214212}
215213
216- define <2 x i64 > @s_uadd_v2i64 (<2 x i64 > inreg %val0 , <2 x i64 > inreg %val1 , ptr %ptrval ) {
214+ define amdgpu_ps <2 x i64 > @s_uadd_v2i64 (<2 x i64 > inreg %val0 , <2 x i64 > inreg %val1 , ptr %ptrval ) {
217215; CHECK-LABEL: s_uadd_v2i64:
218216; CHECK: ; %bb.0:
219- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220- ; CHECK-NEXT: s_add_u32 s4, s18, s22
221- ; CHECK-NEXT: s_addc_u32 s5, s19, s23
222- ; CHECK-NEXT: s_add_u32 s6, s16, s20
223- ; CHECK-NEXT: v_mov_b32_e32 v2, s16
224- ; CHECK-NEXT: s_addc_u32 s7, s17, s21
225- ; CHECK-NEXT: v_mov_b32_e32 v3, s17
226- ; CHECK-NEXT: v_mov_b32_e32 v8, s18
227- ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
228- ; CHECK-NEXT: v_mov_b32_e32 v9, s19
229- ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
230- ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
217+ ; CHECK-NEXT: s_add_u32 s6, s2, s6
218+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
219+ ; CHECK-NEXT: s_addc_u32 s7, s3, s7
220+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
221+ ; CHECK-NEXT: s_add_u32 s4, s0, s4
222+ ; CHECK-NEXT: v_mov_b32_e32 v7, s1
223+ ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
224+ ; CHECK-NEXT: s_addc_u32 s5, s1, s5
225+ ; CHECK-NEXT: v_mov_b32_e32 v6, s0
226+ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
227+ ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
228+ ; CHECK-NEXT: v_readfirstlane_b32 s2, v8
229+ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
230+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v6
231+ ; CHECK-NEXT: v_mov_b32_e32 v2, s4
232+ ; CHECK-NEXT: v_mov_b32_e32 v3, s5
231233; CHECK-NEXT: v_mov_b32_e32 v4, s6
232234; CHECK-NEXT: v_mov_b32_e32 v5, s7
233- ; CHECK-NEXT: v_mov_b32_e32 v6, s4
234- ; CHECK-NEXT: v_mov_b32_e32 v7, s5
235- ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
236- ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
237- ; CHECK-NEXT: v_mov_b32_e32 v0, v2
238- ; CHECK-NEXT: v_mov_b32_e32 v1, v2
239- ; CHECK-NEXT: v_mov_b32_e32 v2, v3
235+ ; CHECK-NEXT: s_mov_b32 s1, s0
236+ ; CHECK-NEXT: s_mov_b32 s3, s2
237+ ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
240238; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
241- ; CHECK-NEXT: s_setpc_b64 s[30:31]
239+ ; CHECK-NEXT: ; return to shader part epilog
242240 %pair = call {<2 x i64 >, <2 x i1 >} @llvm.uadd.with.overflow.v2i64 (<2 x i64 > %val0 , <2 x i64 > %val1 )
243241 %val = extractvalue {<2 x i64 >, <2 x i1 >} %pair , 0
244242 %obit = extractvalue {<2 x i64 >, <2 x i1 >} %pair , 1
@@ -247,32 +245,32 @@ define <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr
247245 ret <2 x i64 > %res
248246}
249247
250- define <2 x i64 > @s_usub_v2i64 (<2 x i64 > inreg %val0 , <2 x i64 > inreg %val1 , ptr %ptrval ) {
248+ define amdgpu_ps <2 x i64 > @s_usub_v2i64 (<2 x i64 > inreg %val0 , <2 x i64 > inreg %val1 , ptr %ptrval ) {
251249; CHECK-LABEL: s_usub_v2i64:
252250; CHECK: ; %bb.0:
253- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254- ; CHECK-NEXT: s_sub_u32 s4, s18, s22
255- ; CHECK-NEXT: s_subb_u32 s5, s19, s23
256- ; CHECK-NEXT: s_sub_u32 s6, s16, s20
257- ; CHECK-NEXT: v_mov_b32_e32 v2, s16
258- ; CHECK-NEXT: s_subb_u32 s7, s17, s21
259- ; CHECK-NEXT: v_mov_b32_e32 v3, s17
260- ; CHECK-NEXT: v_mov_b32_e32 v8, s18
261- ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
262- ; CHECK-NEXT: v_mov_b32_e32 v9, s19
263- ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
264- ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[8:9]
251+ ; CHECK-NEXT: s_sub_u32 s6, s2, s6
252+ ; CHECK-NEXT: v_mov_b32_e32 v9, s3
253+ ; CHECK-NEXT: s_subb_u32 s7, s3, s7
254+ ; CHECK-NEXT: v_mov_b32_e32 v8, s2
255+ ; CHECK-NEXT: s_sub_u32 s4, s0, s4
256+ ; CHECK-NEXT: v_mov_b32_e32 v7, s1
257+ ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
258+ ; CHECK-NEXT: s_subb_u32 s5, s1, s5
259+ ; CHECK-NEXT: v_mov_b32_e32 v6, s0
260+ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
261+ ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
262+ ; CHECK-NEXT: v_readfirstlane_b32 s2, v8
263+ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
264+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v6
265+ ; CHECK-NEXT: v_mov_b32_e32 v2, s4
266+ ; CHECK-NEXT: v_mov_b32_e32 v3, s5
265267; CHECK-NEXT: v_mov_b32_e32 v4, s6
266268; CHECK-NEXT: v_mov_b32_e32 v5, s7
267- ; CHECK-NEXT: v_mov_b32_e32 v6, s4
268- ; CHECK-NEXT: v_mov_b32_e32 v7, s5
269- ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
270- ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
271- ; CHECK-NEXT: v_mov_b32_e32 v0, v2
272- ; CHECK-NEXT: v_mov_b32_e32 v1, v2
273- ; CHECK-NEXT: v_mov_b32_e32 v2, v3
269+ ; CHECK-NEXT: s_mov_b32 s1, s0
270+ ; CHECK-NEXT: s_mov_b32 s3, s2
271+ ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
274272; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
275- ; CHECK-NEXT: s_setpc_b64 s[30:31]
273+ ; CHECK-NEXT: ; return to shader part epilog
276274 %pair = call {<2 x i64 >, <2 x i1 >} @llvm.usub.with.overflow.v2i64 (<2 x i64 > %val0 , <2 x i64 > %val1 )
277275 %val = extractvalue {<2 x i64 >, <2 x i1 >} %pair , 0
278276 %obit = extractvalue {<2 x i64 >, <2 x i1 >} %pair , 1
@@ -281,22 +279,22 @@ define <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr
281279 ret <2 x i64 > %res
282280}
283281
284- define i64 @s_uadd_i64 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
282+ define amdgpu_ps i64 @s_uadd_i64 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
285283; CHECK-LABEL: s_uadd_i64:
286284; CHECK: ; %bb.0:
287- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288- ; CHECK-NEXT: s_add_u32 s4, s16, s18
289- ; CHECK-NEXT: v_mov_b32_e32 v2, s16
290- ; CHECK-NEXT: s_addc_u32 s5, s17, s19
291- ; CHECK-NEXT: v_mov_b32_e32 v3, s17
292- ; CHECK-NEXT: v_mov_b32_e32 v4, s4
293- ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
294- ; CHECK-NEXT: v_mov_b32_e32 v5, s5
285+ ; CHECK-NEXT: s_add_u32 s2, s0, s2
286+ ; CHECK-NEXT: v_mov_b32_e32 v3, s1
287+ ; CHECK-NEXT: s_addc_u32 s3, s1, s3
288+ ; CHECK-NEXT: v_mov_b32_e32 v2, s0
289+ ; CHECK-NEXT: v_mov_b32_e32 v5, s3
290+ ; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
291+ ; CHECK-NEXT: v_mov_b32_e32 v4, s2
295292; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
296293; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
297- ; CHECK-NEXT: v_mov_b32_e32 v1, v0
294+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
295+ ; CHECK-NEXT: s_mov_b32 s1, s0
298296; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
299- ; CHECK-NEXT: s_setpc_b64 s[30:31]
297+ ; CHECK-NEXT: ; return to shader part epilog
300298 %pair = call {i64 , i1 } @llvm.uadd.with.overflow.i64 (i64 %val0 , i64 %val1 )
301299 %val = extractvalue {i64 , i1 } %pair , 0
302300 %obit = extractvalue {i64 , i1 } %pair , 1
@@ -305,21 +303,21 @@ define i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
305303 ret i64 %res
306304}
307305
308- define i64 @s_uadd_p1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
306+ define amdgpu_ps i64 @s_uadd_p1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
309307; CHECK-LABEL: s_uadd_p1:
310308; CHECK: ; %bb.0:
311- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312- ; CHECK-NEXT: s_add_u32 s4, s16, 1
313- ; CHECK-NEXT: s_addc_u32 s5, s17, 0
314- ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
315- ; CHECK-NEXT: v_mov_b32_e32 v2, s4
316- ; CHECK-NEXT: v_mov_b32_e32 v3, s5
317- ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
309+ ; CHECK-NEXT: s_add_u32 s0, s0, 1
310+ ; CHECK-NEXT: s_addc_u32 s1, s1, 0
311+ ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
312+ ; CHECK-NEXT: v_mov_b32_e32 v3, s1
313+ ; CHECK-NEXT: v_mov_b32_e32 v2, s0
314+ ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
318315; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
319- ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
320- ; CHECK-NEXT: v_mov_b32_e32 v1, v0
316+ ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
317+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
318+ ; CHECK-NEXT: s_mov_b32 s1, s0
321319; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
322- ; CHECK-NEXT: s_setpc_b64 s[30:31]
320+ ; CHECK-NEXT: ; return to shader part epilog
323321 %pair = call {i64 , i1 } @llvm.uadd.with.overflow.i64 (i64 %val0 , i64 1 )
324322 %val = extractvalue {i64 , i1 } %pair , 0
325323 %obit = extractvalue {i64 , i1 } %pair , 1
@@ -328,21 +326,21 @@ define i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
328326 ret i64 %res
329327}
330328
331- define i64 @s_uadd_n1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
329+ define amdgpu_ps i64 @s_uadd_n1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
332330; CHECK-LABEL: s_uadd_n1:
333331; CHECK: ; %bb.0:
334- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335- ; CHECK-NEXT: s_add_u32 s4, s16, -1
336- ; CHECK-NEXT: s_addc_u32 s5, s17, -1
337- ; CHECK-NEXT: s_cmp_lg_u64 s[16:17], 0
338- ; CHECK-NEXT: v_mov_b32_e32 v2, s4
339- ; CHECK-NEXT: v_mov_b32_e32 v3, s5
340- ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
332+ ; CHECK-NEXT: s_add_u32 s2, s0, -1
333+ ; CHECK-NEXT: s_addc_u32 s3, s1, -1
334+ ; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
335+ ; CHECK-NEXT: v_mov_b32_e32 v2, s2
336+ ; CHECK-NEXT: v_mov_b32_e32 v3, s3
337+ ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
341338; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
342- ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
343- ; CHECK-NEXT: v_mov_b32_e32 v1, v0
339+ ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
340+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
341+ ; CHECK-NEXT: s_mov_b32 s1, s0
344342; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
345- ; CHECK-NEXT: s_setpc_b64 s[30:31]
343+ ; CHECK-NEXT: ; return to shader part epilog
346344 %pair = call {i64 , i1 } @llvm.uadd.with.overflow.i64 (i64 %val0 , i64 -1 )
347345 %val = extractvalue {i64 , i1 } %pair , 0
348346 %obit = extractvalue {i64 , i1 } %pair , 1
@@ -351,22 +349,22 @@ define i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
351349 ret i64 %res
352350}
353351
354- define i64 @s_usub_p1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
352+ define amdgpu_ps i64 @s_usub_p1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
355353; CHECK-LABEL: s_usub_p1:
356354; CHECK: ; %bb.0:
357- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358- ; CHECK-NEXT: s_add_u32 s4, s16, -1
359- ; CHECK-NEXT: v_mov_b32_e32 v2, s16
360- ; CHECK-NEXT: s_addc_u32 s5, s17, -1
361- ; CHECK-NEXT: v_mov_b32_e32 v3, s17
362- ; CHECK-NEXT: v_mov_b32_e32 v4, s4
363- ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
364- ; CHECK-NEXT: v_mov_b32_e32 v5, s5
355+ ; CHECK-NEXT: s_add_u32 s2, s0, -1
356+ ; CHECK-NEXT: v_mov_b32_e32 v3, s1
357+ ; CHECK-NEXT: s_addc_u32 s3, s1, -1
358+ ; CHECK-NEXT: v_mov_b32_e32 v2, s0
359+ ; CHECK-NEXT: v_mov_b32_e32 v5, s3
360+ ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
361+ ; CHECK-NEXT: v_mov_b32_e32 v4, s2
365362; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
366363; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
367- ; CHECK-NEXT: v_mov_b32_e32 v1, v0
364+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
365+ ; CHECK-NEXT: s_mov_b32 s1, s0
368366; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
369- ; CHECK-NEXT: s_setpc_b64 s[30:31]
367+ ; CHECK-NEXT: ; return to shader part epilog
370368 %pair = call {i64 , i1 } @llvm.usub.with.overflow.i64 (i64 %val0 , i64 1 )
371369 %val = extractvalue {i64 , i1 } %pair , 0
372370 %obit = extractvalue {i64 , i1 } %pair , 1
@@ -375,22 +373,22 @@ define i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
375373 ret i64 %res
376374}
377375
378- define i64 @s_usub_n1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
376+ define amdgpu_ps i64 @s_usub_n1 (i64 inreg %val0 , i64 inreg %val1 , ptr %ptrval ) {
379377; CHECK-LABEL: s_usub_n1:
380378; CHECK: ; %bb.0:
381- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382- ; CHECK-NEXT: s_add_u32 s4, s16, 1
383- ; CHECK-NEXT: v_mov_b32_e32 v2, s16
384- ; CHECK-NEXT: s_addc_u32 s5, s17, 0
385- ; CHECK-NEXT: v_mov_b32_e32 v3, s17
386- ; CHECK-NEXT: v_mov_b32_e32 v4, s4
387- ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
388- ; CHECK-NEXT: v_mov_b32_e32 v5, s5
379+ ; CHECK-NEXT: s_add_u32 s2, s0, 1
380+ ; CHECK-NEXT: v_mov_b32_e32 v3, s1
381+ ; CHECK-NEXT: s_addc_u32 s3, s1, 0
382+ ; CHECK-NEXT: v_mov_b32_e32 v2, s0
383+ ; CHECK-NEXT: v_mov_b32_e32 v5, s3
384+ ; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
385+ ; CHECK-NEXT: v_mov_b32_e32 v4, s2
389386; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
390387; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
391- ; CHECK-NEXT: v_mov_b32_e32 v1, v0
388+ ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
389+ ; CHECK-NEXT: s_mov_b32 s1, s0
392390; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
393- ; CHECK-NEXT: s_setpc_b64 s[30:31]
391+ ; CHECK-NEXT: ; return to shader part epilog
394392 %pair = call {i64 , i1 } @llvm.usub.with.overflow.i64 (i64 %val0 , i64 -1 )
395393 %val = extractvalue {i64 , i1 } %pair , 0
396394 %obit = extractvalue {i64 , i1 } %pair , 1
0 commit comments