Commit 4fbecc2

Address feedback. Add SGPR variants.
Signed-off-by: John Lu <[email protected]>
1 parent 939a425 commit 4fbecc2
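
The new tests mirror the existing VGPR cases (now prefixed v_) with SGPR variants (prefixed s_) whose operands are marked inreg, so the AMDGPU calling convention passes them in scalar registers and the 64-bit carry chains are selected as s_add_u32/s_addc_u32 (and s_sub_u32/s_subb_u32) instead of the VALU forms used in the v_ tests. A minimal sketch of that pattern, with a hypothetical function name that is not part of the diff:

    ; Sketch only: 'inreg' is what steers the operands into SGPRs (s16, s17, ...)
    ; in the CHECK lines below; without it the operands arrive in VGPRs.
    declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)

    define i64 @sketch_sgpr_uadd(i64 inreg %a, i64 inreg %b, ptr %p) {
      %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
      %sum = extractvalue {i64, i1} %pair, 0
      store i64 %sum, ptr %p
      ret i64 %sum
    }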

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 230 additions & 16 deletions
@@ -15,8 +15,8 @@ declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
 declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
 declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
 
-define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
-; CHECK-LABEL: add64_32:
+define %struct.uint96 @v_add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
+; CHECK-LABEL: v_add64_32:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
@@ -35,8 +35,8 @@ define hidden %struct.uint96 @add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
   ret %struct.uint96 %.fca.1.insert
 }
 
-define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
-; CHECK-LABEL: uadd_v2i64:
+define <2 x i64> @v_uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: v_uadd_v2i64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
@@ -60,8 +60,8 @@ define <2 x i64> @uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
   ret <2 x i64> %res
 }
 
-define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
-; CHECK-LABEL: usub_v2i64:
+define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
+; CHECK-LABEL: v_usub_v2i64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
@@ -85,8 +85,8 @@ define <2 x i64> @usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
   ret <2 x i64> %res
 }
 
-define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
-; CHECK-LABEL: uadd_i64:
+define i64 @v_uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: v_uadd_i64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
@@ -105,8 +105,8 @@ define i64 @uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
   ret i64 %res
 }
 
-define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
-; CHECK-LABEL: uadd_p1:
+define i64 @v_uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: v_uadd_p1:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
@@ -125,8 +125,8 @@ define i64 @uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
   ret i64 %res
 }
 
-define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
-; CHECK-LABEL: uadd_n1:
+define i64 @v_uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: v_uadd_n1:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
@@ -145,8 +145,8 @@ define i64 @uadd_n1(i64 %val0, i64 %val1, ptr %ptrval) {
   ret i64 %res
 }
 
-define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
-; CHECK-LABEL: usub_p1:
+define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: v_usub_p1:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
@@ -165,8 +165,8 @@ define i64 @usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
   ret i64 %res
 }
 
-define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
-; CHECK-LABEL: usub_n1:
+define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
+; CHECK-LABEL: v_usub_n1:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
@@ -184,3 +184,217 @@ define i64 @usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
   store i64 %val, ptr %ptrval
   ret i64 %res
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; test SGPR
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B, i32 inreg %val32) {
+; CHECK-LABEL: s_add64_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, s18
+; CHECK-NEXT: v_mov_b32_e32 v0, s16
+; CHECK-NEXT: s_addc_u32 s5, s17, s19
+; CHECK-NEXT: v_mov_b32_e32 v1, s17
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_addc_u32 s6, s20, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %sum64 = add i64 %val64A, %val64B
+  %obit = icmp ult i64 %sum64, %val64A
+  %obit32 = zext i1 %obit to i32
+  %sum32 = add i32 %val32, %obit32
+  %.fca.0.insert = insertvalue %struct.uint96 poison, i64 %sum64, 0
+  %.fca.1.insert = insertvalue %struct.uint96 %.fca.0.insert, i32 %sum32, 1
+  ret %struct.uint96 %.fca.1.insert
+}
+
+define <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_uadd_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s18, s22
+; CHECK-NEXT: s_addc_u32 s5, s19, s23
+; CHECK-NEXT: s_add_u32 s6, s16, s20
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: s_addc_u32 s7, s17, s21
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: v_mov_b32_e32 v8, s18
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v9, s19
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v6, s4
+; CHECK-NEXT: v_mov_b32_e32 v7, s5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, v3
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+  %res = sext <2 x i1> %obit to <2 x i64>
+  store <2 x i64> %val, ptr %ptrval
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_usub_v2i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_sub_u32 s4, s18, s22
+; CHECK-NEXT: s_subb_u32 s5, s19, s23
+; CHECK-NEXT: s_sub_u32 s6, s16, s20
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: s_subb_u32 s7, s17, s21
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: v_mov_b32_e32 v8, s18
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v9, s19
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[8:9]
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v6, s4
+; CHECK-NEXT: v_mov_b32_e32 v7, s5
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, v3
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
+  %val = extractvalue {<2 x i64>, <2 x i1>} %pair, 0
+  %obit = extractvalue {<2 x i64>, <2 x i1>} %pair, 1
+  %res = sext <2 x i1> %obit to <2 x i64>
+  store <2 x i64> %val, ptr %ptrval
+  ret <2 x i64> %res
+}
+
+define i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_uadd_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, s18
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: s_addc_u32 s5, s17, s19
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: v_mov_b32_e32 v4, s4
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v5, s5
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 %val1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_uadd_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, 1
+; CHECK-NEXT: s_addc_u32 s5, s17, 0
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @s_uadd_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_uadd_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, -1
+; CHECK-NEXT: s_addc_u32 s5, s17, -1
+; CHECK-NEXT: s_cmp_lg_u64 s[16:17], 0
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %val0, i64 -1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_usub_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, -1
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: s_addc_u32 s5, s17, -1
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: v_mov_b32_e32 v4, s4
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v5, s5
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
+
+define i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
+; CHECK-LABEL: s_usub_n1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_add_u32 s4, s16, 1
+; CHECK-NEXT: v_mov_b32_e32 v2, s16
+; CHECK-NEXT: s_addc_u32 s5, s17, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, s17
+; CHECK-NEXT: v_mov_b32_e32 v4, s4
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v5, s5
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %pair = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %val0, i64 -1)
+  %val = extractvalue {i64, i1} %pair, 0
+  %obit = extractvalue {i64, i1} %pair, 1
+  %res = sext i1 %obit to i64
+  store i64 %val, ptr %ptrval
+  ret i64 %res
+}
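
For reference, the RUN line that drives these CHECK prefixes sits near the top of addsub64_carry.ll and is untouched by this hunk; an invocation of roughly this shape is assumed (the actual triple/CPU flags may differ):

    ; Hypothetical RUN line, for illustration only:
    ; RUN: llc -mtriple=amdgcn < %s | FileCheck %s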
