
Commit 3e10bdd

[NFC][AMDGPU] Pre-commit test for setcc removal by using add/sub carryout (#155118)
Pre-commit test for setcc removal by using add/sub carryout.

Signed-off-by: John Lu <[email protected]>
1 parent 9bb860e commit 3e10bdd
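In reduced form, the pattern these tests exercise looks like the sketch below (illustrative only; the function and value names here are hypothetical and do not appear in the test file): the carry-out of a 64-bit unsigned add is consumed by further arithmetic. On gfx900 the 64-bit add is already split into v_add_co_u32 / v_addc_co_u32, which leave that carry in VCC, yet the backend currently re-derives it with a separate compare.

; Minimal LLVM IR sketch of the pattern under test (names illustrative).
declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)

define i32 @carry_reuse_sketch(i64 %a, i64 %b, i32 %c) {
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %carry = extractvalue {i64, i1} %pair, 1    ; carry-out of the 64-bit add
  %ext = zext i1 %carry to i32
  %sum = add i32 %c, %ext                     ; consumes the carry again
  ret i32 %sum
}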

1 file changed: 396 additions & 0 deletions
@@ -0,0 +1,396 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;; Test that carryout from 64-bit add/sub (synthesized from two 32-bit adds/subs) is utilized
;; (i.e. no additional compare is generated).

; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s

%struct.uint96 = type { i64, i32 }
%struct.uint64pair = type { i64, i64 }

declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)

declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)

define %struct.uint96 @v_add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
; CHECK-LABEL: v_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
; CHECK-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v0, v5
; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v6
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %sum64 = add i64 %val64A, %val64B
  %obit = icmp ult i64 %sum64, %val64A
  %obit32 = zext i1 %obit to i32
  %sum32 = add i32 %val32, %obit32
  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
  ret %struct.uint96 %.fca.1.insert
}
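
;; For reference, a hedged sketch of the sequence v_add64_32 could reduce to
;; once the carryout is reused (illustrative only, not generated output): the
;; v_cmp_lt_u64 above disappears because the carry is consumed straight from
;; VCC, where v_addc_co_u32 already left it:
;;   v_add_co_u32_e32 v0, vcc, v0, v2
;;   v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
;;   v_addc_co_u32_e32 v2, vcc, 0, v4, vcc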

define <2 x i64> @v_uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4
; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
; CHECK-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4
; CHECK-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define i64 @v_uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define i64 @v_uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define i64 @v_uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_uadd_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
; CHECK-LABEL: v_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}
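
;; The usub tests above have the same shape on the borrow side: after
;; v_sub_co_u32 / v_subb_co_u32 the borrow is already in VCC, so a sequence
;; like the following hedged sketch (illustrative only, not generated output)
;;   v_sub_co_u32_e32 v2, vcc, v0, v4
;;   v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
;;   v_cndmask_b32_e64 v0, 0, -1, vcc
;; would make the v_cmp_gt_u64 recomputation unnecessary.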

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; test SGPR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s6, s0, s2
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: s_addc_u32 s7, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; CHECK-NEXT: s_mov_b32 s0, s6
; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: s_mov_b32 s1, s7
; CHECK-NEXT: ; return to shader part epilog
  %sum64 = add i64 %val64A, %val64B
  %obit = icmp ult i64 %sum64, %val64A
  %obit32 = zext i1 %obit to i32
  %sum32 = add i32 %val32, %obit32
  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
  ret %struct.uint96 %.fca.1.insert
}
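
;; Hedged sketch of the SGPR sequence s_add64_32 could reduce to once the
;; carryout (SCC) set by s_addc_u32 is reused directly (illustrative only,
;; not generated output; it drops the v_cmp_lt_u64 / s_cmp_lg_u64 pair):
;;   s_add_u32 s0, s0, s2
;;   s_addc_u32 s1, s1, s3
;;   s_addc_u32 s2, s4, 0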

define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s6, s2, s6
; CHECK-NEXT: v_mov_b32_e32 v9, s3
; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: v_mov_b32_e32 v8, s2
; CHECK-NEXT: s_add_u32 s4, s0, s4
; CHECK-NEXT: v_mov_b32_e32 v7, s1
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; CHECK-NEXT: s_addc_u32 s5, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v6, s0
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
; CHECK-NEXT: v_readfirstlane_b32 s2, v8
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; CHECK-NEXT: v_readfirstlane_b32 s0, v6
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s5
; CHECK-NEXT: v_mov_b32_e32 v4, s6
; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s6, s2, s6
; CHECK-NEXT: v_mov_b32_e32 v9, s3
; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: v_mov_b32_e32 v8, s2
; CHECK-NEXT: s_sub_u32 s4, s0, s4
; CHECK-NEXT: v_mov_b32_e32 v7, s1
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
; CHECK-NEXT: s_subb_u32 s5, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v6, s0
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
; CHECK-NEXT: v_readfirstlane_b32 s2, v8
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; CHECK-NEXT: v_readfirstlane_b32 s0, v6
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s5
; CHECK-NEXT: v_mov_b32_e32 v4, s6
; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
  %res = sext <2 x i1> %obit to <2 x i64>
  store <2 x i64> %val, ptr %ptrval
  ret <2 x i64> %res
}

define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s2, s0, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: s_addc_u32 s3, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v5, s3
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v4, s2
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s2, s0, -1
; CHECK-NEXT: s_addc_u32 s3, s1, -1
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s2, s0, -1
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: s_addc_u32 s3, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v5, s3
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v4, s2
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}

define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s2, s0, 1
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: s_addc_u32 s3, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v5, s3
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v4, s2
; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
  %val = extractvalue {i64, i1} %pair, 0
  %obit = extractvalue {i64, i1} %pair, 1
  %res = sext i1 %obit to i64
  store i64 %val, ptr %ptrval
  ret i64 %res
}
