Skip to content

Commit a2d07e2

Browse files
committed
When testing SGPR, return result in SGPR
Signed-off-by: John Lu <[email protected]>
1 parent 4fbecc2, commit a2d07e2

File tree

1 file changed

+108
-110
lines changed

1 file changed

+108
-110
lines changed

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 108 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
33
;; (i.e. no additional compare is generated).
44

5-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
66

77
%0 = type { i64, i64, i32, i32 }
88
%1 = type { [64 x [8 x i64]] }
@@ -189,21 +189,19 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
189189
; test SGPR
190190
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
191191

192-
define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
192+
define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
193193
; CHECK-LABEL: s_add64_32:
194194
; CHECK: ; %bb.0:
195-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196-
; CHECK-NEXT: s_add_u32 s4, s16, s18
197-
; CHECK-NEXT: v_mov_b32_e32 v0, s16
198-
; CHECK-NEXT: s_addc_u32 s5, s17, s19
199-
; CHECK-NEXT: v_mov_b32_e32 v1, s17
200-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
201-
; CHECK-NEXT: v_mov_b32_e32 v0, s4
195+
; CHECK-NEXT: s_add_u32 s6, s0, s2
196+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
197+
; CHECK-NEXT: s_addc_u32 s7, s1, s3
198+
; CHECK-NEXT: v_mov_b32_e32 v1, s1
199+
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
200+
; CHECK-NEXT: s_mov_b32 s0, s6
202201
; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
203-
; CHECK-NEXT: s_addc_u32 s6, s20, 0
204-
; CHECK-NEXT: v_mov_b32_e32 v1, s5
205-
; CHECK-NEXT: v_mov_b32_e32 v2, s6
206-
; CHECK-NEXT: s_setpc_b64 s[30:31]
202+
; CHECK-NEXT: s_addc_u32 s2, s4, 0
203+
; CHECK-NEXT: s_mov_b32 s1, s7
204+
; CHECK-NEXT: ; return to shader part epilog
207205
%sum64 = add i64 %val64A, %val64B
208206
%obit = icmp ult i64 %sum64, %val64A
209207
%obit32 = zext i1 %obit to i32
@@ -213,32 +211,32 @@ define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inre
213211
ret %struct.uint96 %.fca.1.insert
214212
}
215213

216-
define <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
214+
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
217215
; CHECK-LABEL: s_uadd_v2i64:
218216
; CHECK: ; %bb.0:
219-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220-
; CHECK-NEXT: s_add_u32 s4, s18, s22
221-
; CHECK-NEXT: s_addc_u32 s5, s19, s23
222-
; CHECK-NEXT: s_add_u32 s6, s16, s20
223-
; CHECK-NEXT: v_mov_b32_e32 v2, s16
224-
; CHECK-NEXT: s_addc_u32 s7, s17, s21
225-
; CHECK-NEXT: v_mov_b32_e32 v3, s17
226-
; CHECK-NEXT: v_mov_b32_e32 v8, s18
227-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
228-
; CHECK-NEXT: v_mov_b32_e32 v9, s19
229-
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
230-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
217+
; CHECK-NEXT: s_add_u32 s6, s2, s6
218+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
219+
; CHECK-NEXT: s_addc_u32 s7, s3, s7
220+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
221+
; CHECK-NEXT: s_add_u32 s4, s0, s4
222+
; CHECK-NEXT: v_mov_b32_e32 v7, s1
223+
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
224+
; CHECK-NEXT: s_addc_u32 s5, s1, s5
225+
; CHECK-NEXT: v_mov_b32_e32 v6, s0
226+
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
227+
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
228+
; CHECK-NEXT: v_readfirstlane_b32 s2, v8
229+
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
230+
; CHECK-NEXT: v_readfirstlane_b32 s0, v6
231+
; CHECK-NEXT: v_mov_b32_e32 v2, s4
232+
; CHECK-NEXT: v_mov_b32_e32 v3, s5
231233
; CHECK-NEXT: v_mov_b32_e32 v4, s6
232234
; CHECK-NEXT: v_mov_b32_e32 v5, s7
233-
; CHECK-NEXT: v_mov_b32_e32 v6, s4
234-
; CHECK-NEXT: v_mov_b32_e32 v7, s5
235-
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
236-
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
237-
; CHECK-NEXT: v_mov_b32_e32 v0, v2
238-
; CHECK-NEXT: v_mov_b32_e32 v1, v2
239-
; CHECK-NEXT: v_mov_b32_e32 v2, v3
235+
; CHECK-NEXT: s_mov_b32 s1, s0
236+
; CHECK-NEXT: s_mov_b32 s3, s2
237+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
240238
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
241-
; CHECK-NEXT: s_setpc_b64 s[30:31]
239+
; CHECK-NEXT: ; return to shader part epilog
242240
%pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
243241
%val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
244242
%obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
@@ -247,32 +245,32 @@ define <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr
247245
ret <2 x i64> %res
248246
}
249247

250-
define <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
248+
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
251249
; CHECK-LABEL: s_usub_v2i64:
252250
; CHECK: ; %bb.0:
253-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254-
; CHECK-NEXT: s_sub_u32 s4, s18, s22
255-
; CHECK-NEXT: s_subb_u32 s5, s19, s23
256-
; CHECK-NEXT: s_sub_u32 s6, s16, s20
257-
; CHECK-NEXT: v_mov_b32_e32 v2, s16
258-
; CHECK-NEXT: s_subb_u32 s7, s17, s21
259-
; CHECK-NEXT: v_mov_b32_e32 v3, s17
260-
; CHECK-NEXT: v_mov_b32_e32 v8, s18
261-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
262-
; CHECK-NEXT: v_mov_b32_e32 v9, s19
263-
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
264-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[8:9]
251+
; CHECK-NEXT: s_sub_u32 s6, s2, s6
252+
; CHECK-NEXT: v_mov_b32_e32 v9, s3
253+
; CHECK-NEXT: s_subb_u32 s7, s3, s7
254+
; CHECK-NEXT: v_mov_b32_e32 v8, s2
255+
; CHECK-NEXT: s_sub_u32 s4, s0, s4
256+
; CHECK-NEXT: v_mov_b32_e32 v7, s1
257+
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
258+
; CHECK-NEXT: s_subb_u32 s5, s1, s5
259+
; CHECK-NEXT: v_mov_b32_e32 v6, s0
260+
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
261+
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
262+
; CHECK-NEXT: v_readfirstlane_b32 s2, v8
263+
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
264+
; CHECK-NEXT: v_readfirstlane_b32 s0, v6
265+
; CHECK-NEXT: v_mov_b32_e32 v2, s4
266+
; CHECK-NEXT: v_mov_b32_e32 v3, s5
265267
; CHECK-NEXT: v_mov_b32_e32 v4, s6
266268
; CHECK-NEXT: v_mov_b32_e32 v5, s7
267-
; CHECK-NEXT: v_mov_b32_e32 v6, s4
268-
; CHECK-NEXT: v_mov_b32_e32 v7, s5
269-
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
270-
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
271-
; CHECK-NEXT: v_mov_b32_e32 v0, v2
272-
; CHECK-NEXT: v_mov_b32_e32 v1, v2
273-
; CHECK-NEXT: v_mov_b32_e32 v2, v3
269+
; CHECK-NEXT: s_mov_b32 s1, s0
270+
; CHECK-NEXT: s_mov_b32 s3, s2
271+
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
274272
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
275-
; CHECK-NEXT: s_setpc_b64 s[30:31]
273+
; CHECK-NEXT: ; return to shader part epilog
276274
%pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
277275
%val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
278276
%obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
@@ -281,22 +279,22 @@ define <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr
281279
ret <2 x i64> %res
282280
}
283281

284-
define i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
282+
define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
285283
; CHECK-LABEL: s_uadd_i64:
286284
; CHECK: ; %bb.0:
287-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288-
; CHECK-NEXT: s_add_u32 s4, s16, s18
289-
; CHECK-NEXT: v_mov_b32_e32 v2, s16
290-
; CHECK-NEXT: s_addc_u32 s5, s17, s19
291-
; CHECK-NEXT: v_mov_b32_e32 v3, s17
292-
; CHECK-NEXT: v_mov_b32_e32 v4, s4
293-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
294-
; CHECK-NEXT: v_mov_b32_e32 v5, s5
285+
; CHECK-NEXT: s_add_u32 s2, s0, s2
286+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
287+
; CHECK-NEXT: s_addc_u32 s3, s1, s3
288+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
289+
; CHECK-NEXT: v_mov_b32_e32 v5, s3
290+
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
291+
; CHECK-NEXT: v_mov_b32_e32 v4, s2
295292
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
296293
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
297-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
294+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
295+
; CHECK-NEXT: s_mov_b32 s1, s0
298296
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
299-
; CHECK-NEXT: s_setpc_b64 s[30:31]
297+
; CHECK-NEXT: ; return to shader part epilog
300298
%pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
301299
%val = extractvalue {i64, i1} %pair, 0
302300
%obit = extractvalue {i64, i1} %pair, 1
@@ -305,21 +303,21 @@ define i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
305303
ret i64 %res
306304
}
307305

308-
define i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
306+
define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
309307
; CHECK-LABEL: s_uadd_p1:
310308
; CHECK: ; %bb.0:
311-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312-
; CHECK-NEXT: s_add_u32 s4, s16, 1
313-
; CHECK-NEXT: s_addc_u32 s5, s17, 0
314-
; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
315-
; CHECK-NEXT: v_mov_b32_e32 v2, s4
316-
; CHECK-NEXT: v_mov_b32_e32 v3, s5
317-
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
309+
; CHECK-NEXT: s_add_u32 s0, s0, 1
310+
; CHECK-NEXT: s_addc_u32 s1, s1, 0
311+
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
312+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
313+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
314+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
318315
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
319-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
320-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
316+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
317+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
318+
; CHECK-NEXT: s_mov_b32 s1, s0
321319
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
322-
; CHECK-NEXT: s_setpc_b64 s[30:31]
320+
; CHECK-NEXT: ; return to shader part epilog
323321
%pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
324322
%val = extractvalue {i64, i1} %pair, 0
325323
%obit = extractvalue {i64, i1} %pair, 1
@@ -328,21 +326,21 @@ define i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
328326
ret i64 %res
329327
}
330328

331-
define i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
329+
define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
332330
; CHECK-LABEL: s_uadd_n1:
333331
; CHECK: ; %bb.0:
334-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335-
; CHECK-NEXT: s_add_u32 s4, s16, -1
336-
; CHECK-NEXT: s_addc_u32 s5, s17, -1
337-
; CHECK-NEXT: s_cmp_lg_u64 s[16:17], 0
338-
; CHECK-NEXT: v_mov_b32_e32 v2, s4
339-
; CHECK-NEXT: v_mov_b32_e32 v3, s5
340-
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
332+
; CHECK-NEXT: s_add_u32 s2, s0, -1
333+
; CHECK-NEXT: s_addc_u32 s3, s1, -1
334+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
335+
; CHECK-NEXT: v_mov_b32_e32 v2, s2
336+
; CHECK-NEXT: v_mov_b32_e32 v3, s3
337+
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
341338
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
342-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
343-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
339+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
340+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
341+
; CHECK-NEXT: s_mov_b32 s1, s0
344342
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
345-
; CHECK-NEXT: s_setpc_b64 s[30:31]
343+
; CHECK-NEXT: ; return to shader part epilog
346344
%pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
347345
%val = extractvalue {i64, i1} %pair, 0
348346
%obit = extractvalue {i64, i1} %pair, 1
@@ -351,22 +349,22 @@ define i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
351349
ret i64 %res
352350
}
353351

354-
define i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
352+
define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
355353
; CHECK-LABEL: s_usub_p1:
356354
; CHECK: ; %bb.0:
357-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358-
; CHECK-NEXT: s_add_u32 s4, s16, -1
359-
; CHECK-NEXT: v_mov_b32_e32 v2, s16
360-
; CHECK-NEXT: s_addc_u32 s5, s17, -1
361-
; CHECK-NEXT: v_mov_b32_e32 v3, s17
362-
; CHECK-NEXT: v_mov_b32_e32 v4, s4
363-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
364-
; CHECK-NEXT: v_mov_b32_e32 v5, s5
355+
; CHECK-NEXT: s_add_u32 s2, s0, -1
356+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
357+
; CHECK-NEXT: s_addc_u32 s3, s1, -1
358+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
359+
; CHECK-NEXT: v_mov_b32_e32 v5, s3
360+
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
361+
; CHECK-NEXT: v_mov_b32_e32 v4, s2
365362
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
366363
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
367-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
364+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
365+
; CHECK-NEXT: s_mov_b32 s1, s0
368366
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
369-
; CHECK-NEXT: s_setpc_b64 s[30:31]
367+
; CHECK-NEXT: ; return to shader part epilog
370368
%pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
371369
%val = extractvalue {i64, i1} %pair, 0
372370
%obit = extractvalue {i64, i1} %pair, 1
@@ -375,22 +373,22 @@ define i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
375373
ret i64 %res
376374
}
377375

378-
define i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
376+
define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
379377
; CHECK-LABEL: s_usub_n1:
380378
; CHECK: ; %bb.0:
381-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382-
; CHECK-NEXT: s_add_u32 s4, s16, 1
383-
; CHECK-NEXT: v_mov_b32_e32 v2, s16
384-
; CHECK-NEXT: s_addc_u32 s5, s17, 0
385-
; CHECK-NEXT: v_mov_b32_e32 v3, s17
386-
; CHECK-NEXT: v_mov_b32_e32 v4, s4
387-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
388-
; CHECK-NEXT: v_mov_b32_e32 v5, s5
379+
; CHECK-NEXT: s_add_u32 s2, s0, 1
380+
; CHECK-NEXT: v_mov_b32_e32 v3, s1
381+
; CHECK-NEXT: s_addc_u32 s3, s1, 0
382+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
383+
; CHECK-NEXT: v_mov_b32_e32 v5, s3
384+
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
385+
; CHECK-NEXT: v_mov_b32_e32 v4, s2
389386
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
390387
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
391-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
388+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
389+
; CHECK-NEXT: s_mov_b32 s1, s0
392390
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
393-
; CHECK-NEXT: s_setpc_b64 s[30:31]
391+
; CHECK-NEXT: ; return to shader part epilog
394392
%pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
395393
%val = extractvalue {i64, i1} %pair, 0
396394
%obit = extractvalue {i64, i1} %pair, 1

0 commit comments

Comments
 (0)