Skip to content

Commit b47c28f

Browse files
committed
Remove setcc by using add/sub carryout
Signed-off-by: John Lu <[email protected]>
1 parent c82304e commit b47c28f

File tree

9 files changed

+736
-737
lines changed

9 files changed

+736
-737
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16579,6 +16579,62 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1657916579
}
1658016580
}
1658116581

16582+
// Eliminate setcc by using carryout from add/sub instruction
16583+
16584+
//   X = ADD i64 Y, Z             Xlo = UADDO       i32 Ylo, Zlo
16585+
//   setcc X ult Y        ->      Xhi = UADDO_CARRY i32 Yhi, Zhi  (setcc := carry-out of Xhi)
16586+
// Similarly for subtraction: X = SUB i64 Y, Z with setcc X ugt Y uses USUBO / USUBO_CARRY.
16587+
16588+
//   X = ADD i64 Y, 1             Xlo = UADDO       i32 Ylo, 1
16589+
//   setcc X eq 0         ->      Xhi = UADDO_CARRY i32 Yhi, 0    (setcc := carry-out of Xhi)
16590+
16591+
// Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
16592+
// non-divergent operations. Splitting can result in lo/hi 32-bit operations
16593+
// being done in SGPR and VGPR with additional operations being needed
16594+
// to move operands and/or generate the intermediate carry.
16595+
if (VT == MVT::i64 && N->isDivergent() &&
16596+
((((LHS.getOpcode() == ISD::ADD && CC == ISD::SETULT) ||
16597+
(LHS.getOpcode() == ISD::SUB && CC == ISD::SETUGT)) &&
16598+
LHS.getOperand(0) == RHS) ||
16599+
(LHS.getOpcode() == ISD::ADD && CC == ISD::SETEQ && CRHS &&
16600+
CRHS->isZero() && dyn_cast<ConstantSDNode>(LHS.getOperand(1)) &&
16601+
dyn_cast<ConstantSDNode>(LHS.getOperand(1))->isOne()))) {
16602+
EVT TargetType = MVT::i32;
16603+
EVT CarryVT = MVT::i1;
16604+
const SDValue One = DAG.getConstant(1, SL, TargetType);
16605+
bool IsAdd = LHS.getOpcode() == ISD::ADD;
16606+
16607+
SDValue Op0 = LHS.getOperand(0);
16608+
SDValue Op1 = LHS.getOperand(1);
16609+
16610+
SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op0);
16611+
SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, Op1);
16612+
16613+
SDValue Op0Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op0, One);
16614+
SDValue Op1Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, TargetType, Op1, One);
16615+
16616+
SDValue NodeLo =
16617+
DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16618+
DAG.getVTList(TargetType, CarryVT), {Op0Lo, Op1Lo});
16619+
16620+
SDValue CarryInHi = SDValue(NodeLo.getNode(), 1);
16621+
SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16622+
SL, DAG.getVTList(TargetType, CarryVT),
16623+
{Op0Hi, Op1Hi, CarryInHi});
16624+
16625+
SDValue ResultLo = SDValue(NodeLo.getNode(), 0);
16626+
SDValue ResultHi = SDValue(NodeHi.getNode(), 0);
16627+
16628+
EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
16629+
SDValue JoinedResult =
16630+
DAG.getBuildVector(ConcatType, SL, {ResultLo, ResultHi});
16631+
16632+
SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16633+
SDValue Overflow = SDValue(NodeHi.getNode(), 1);
16634+
DCI.CombineTo(LHS.getNode(), Result);
16635+
return Overflow;
16636+
}
16637+
1658216638
if (VT != MVT::f32 && VT != MVT::f64 &&
1658316639
(!Subtarget->has16BitInsts() || VT != MVT::f16))
1658416640
return SDValue();

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 21 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,9 @@ define %struct.uint96 @v_add64_32(i64 %val64A, i64 %val64B, i32 %val32) {
1717
; CHECK-LABEL: v_add64_32:
1818
; CHECK: ; %bb.0:
1919
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20-
; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
21-
; CHECK-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v3, vcc
22-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[0:1]
23-
; CHECK-NEXT: v_mov_b32_e32 v0, v5
20+
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
21+
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2422
; CHECK-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
25-
; CHECK-NEXT: v_mov_b32_e32 v1, v6
2623
; CHECK-NEXT: s_setpc_b64 s[30:31]
2724
%sum64 = add i64 %val64A, %val64B
2825
%obit = icmp ult i64 %sum64, %val64A
@@ -38,16 +35,14 @@ define <2 x i64> @v_uadd_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
3835
; CHECK: ; %bb.0:
3936
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4037
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v2, v6
38+
; CHECK-NEXT: v_add_co_u32_e64 v4, s[4:5], v0, v4
4139
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
42-
; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4
43-
; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
44-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
45-
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
46-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
47-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
48-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
40+
; CHECK-NEXT: v_addc_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
41+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
4942
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
43+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
5044
; CHECK-NEXT: v_mov_b32_e32 v3, v2
45+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
5146
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5247
; CHECK-NEXT: s_setpc_b64 s[30:31]
5348
%pair = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -63,16 +58,14 @@ define <2 x i64> @v_usub_v2i64(<2 x i64> %val0, <2 x i64> %val1, ptr %ptrval) {
6358
; CHECK: ; %bb.0:
6459
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6560
; CHECK-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v6
61+
; CHECK-NEXT: v_sub_co_u32_e64 v4, s[4:5], v0, v4
6662
; CHECK-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
67-
; CHECK-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4
68-
; CHECK-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
69-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[0:1]
70-
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
71-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
72-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
73-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
63+
; CHECK-NEXT: v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
64+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
7465
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
66+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
7567
; CHECK-NEXT: v_mov_b32_e32 v3, v2
68+
; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
7669
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7770
; CHECK-NEXT: s_setpc_b64 s[30:31]
7871
%pair = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %val0, <2 x i64> %val1)
@@ -87,10 +80,9 @@ define i64 @v_uadd_i64(i64 %val0, i64 %val1, ptr %ptrval) {
8780
; CHECK-LABEL: v_uadd_i64:
8881
; CHECK: ; %bb.0:
8982
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
91-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
92-
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
93-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
83+
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
84+
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
85+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
9486
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
9587
; CHECK-NEXT: v_mov_b32_e32 v1, v0
9688
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -109,7 +101,6 @@ define i64 @v_uadd_p1(i64 %val0, i64 %val1, ptr %ptrval) {
109101
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110102
; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
111103
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
112-
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
113104
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
114105
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
115106
; CHECK-NEXT: v_mov_b32_e32 v1, v0
@@ -147,10 +138,9 @@ define i64 @v_usub_p1(i64 %val0, i64 %val1, ptr %ptrval) {
147138
; CHECK-LABEL: v_usub_p1:
148139
; CHECK: ; %bb.0:
149140
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
151-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
152-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
153-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
141+
; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v0
142+
; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
143+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
154144
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
155145
; CHECK-NEXT: v_mov_b32_e32 v1, v0
156146
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -167,10 +157,9 @@ define i64 @v_usub_n1(i64 %val0, i64 %val1, ptr %ptrval) {
167157
; CHECK-LABEL: v_usub_n1:
168158
; CHECK: ; %bb.0:
169159
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170-
; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
171-
; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
172-
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
173-
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
160+
; CHECK-NEXT: v_subrev_co_u32_e32 v0, vcc, -1, v0
161+
; CHECK-NEXT: v_subbrev_co_u32_e32 v1, vcc, -1, v1, vcc
162+
; CHECK-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
174163
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
175164
; CHECK-NEXT: v_mov_b32_e32 v1, v0
176165
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 34 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
841841
; GCN-ISEL-LABEL: name: vuaddo64
842842
; GCN-ISEL-LABEL: body:
843843
; GCN-ISEL-LABEL: bb.0
844-
; GCN-ISEL: V_ADD_U64_PSEUDO
844+
; GCN-ISEL: V_ADD_CO_U32_e64
845845

846846
define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
847847
; CISI-LABEL: vuaddo64:
@@ -854,9 +854,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
854854
; CISI-NEXT: s_mov_b32 s4, s0
855855
; CISI-NEXT: v_mov_b32_e32 v1, s9
856856
; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
857-
; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
858-
; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
859857
; CISI-NEXT: s_mov_b32 s5, s1
858+
; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
860859
; CISI-NEXT: s_mov_b32 s0, s2
861860
; CISI-NEXT: s_mov_b32 s1, s3
862861
; CISI-NEXT: s_mov_b32 s2, s6
@@ -876,7 +875,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
876875
; VI-NEXT: v_mov_b32_e32 v6, s5
877876
; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
878877
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
879-
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
880878
; VI-NEXT: v_mov_b32_e32 v2, s1
881879
; VI-NEXT: v_mov_b32_e32 v3, s2
882880
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -894,7 +892,6 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
894892
; GFX9-NEXT: v_mov_b32_e32 v1, s7
895893
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
896894
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
897-
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
898895
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
899896
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
900897
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -909,8 +906,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
909906
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
910907
; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0
911908
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
912-
; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
913-
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
909+
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
914910
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
915911
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
916912
; GFX1010-NEXT: s_endpgm
@@ -923,9 +919,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
923919
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
924920
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
925921
; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0
926-
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
927-
; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
928-
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
922+
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
923+
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
929924
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
930925
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
931926
; GFX1030W32-NEXT: s_endpgm
@@ -938,9 +933,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
938933
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
939934
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
940935
; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0
941-
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
942-
; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
943-
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
936+
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
937+
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
944938
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
945939
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
946940
; GFX1030W64-NEXT: s_endpgm
@@ -955,10 +949,9 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
955949
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
956950
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
957951
; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0
958-
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4
952+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
959953
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
960-
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
961-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
954+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
962955
; GFX11-NEXT: s_clause 0x1
963956
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
964957
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -969,16 +962,17 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
969962
; GFX1250-NEXT: s_clause 0x1
970963
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
971964
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
972-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
973965
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
966+
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
974967
; GFX1250-NEXT: s_wait_kmcnt 0x0
975-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
976-
; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
977-
; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3]
978-
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
968+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
969+
; GFX1250-NEXT: v_add_co_u32 v0, s4, s6, v0
970+
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
971+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
972+
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
979973
; GFX1250-NEXT: s_clause 0x1
980-
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
981-
; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
974+
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
975+
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
982976
; GFX1250-NEXT: s_endpgm
983977
%tid = call i32 @llvm.amdgcn.workitem.id.x()
984978
%tid.ext = sext i32 %tid to i64
@@ -1821,7 +1815,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18211815
; GCN-ISEL-LABEL: name: vusubo64
18221816
; GCN-ISEL-LABEL: body:
18231817
; GCN-ISEL-LABEL: bb.0
1824-
; GCN-ISEL: V_SUB_U64_PSEUDO
1818+
; GCN-ISEL: V_SUBB_U32_e64
18251819

18261820
define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
18271821
; CISI-LABEL: vusubo64:
@@ -1834,9 +1828,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18341828
; CISI-NEXT: s_mov_b32 s4, s0
18351829
; CISI-NEXT: v_mov_b32_e32 v1, s9
18361830
; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
1837-
; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1838-
; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
18391831
; CISI-NEXT: s_mov_b32 s5, s1
1832+
; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
18401833
; CISI-NEXT: s_mov_b32 s0, s2
18411834
; CISI-NEXT: s_mov_b32 s1, s3
18421835
; CISI-NEXT: s_mov_b32 s2, s6
@@ -1856,7 +1849,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18561849
; VI-NEXT: v_mov_b32_e32 v6, s5
18571850
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0
18581851
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
1859-
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
18601852
; VI-NEXT: v_mov_b32_e32 v2, s1
18611853
; VI-NEXT: v_mov_b32_e32 v3, s2
18621854
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1874,7 +1866,6 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18741866
; GFX9-NEXT: v_mov_b32_e32 v1, s7
18751867
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
18761868
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1877-
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
18781869
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
18791870
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
18801871
; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
@@ -1889,8 +1880,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18891880
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
18901881
; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0
18911882
; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1892-
; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1893-
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1883+
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
18941884
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
18951885
; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
18961886
; GFX1010-NEXT: s_endpgm
@@ -1903,9 +1893,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19031893
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
19041894
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
19051895
; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0
1906-
; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1907-
; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1908-
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1896+
; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1897+
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19091898
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
19101899
; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
19111900
; GFX1030W32-NEXT: s_endpgm
@@ -1918,9 +1907,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19181907
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
19191908
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
19201909
; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0
1921-
; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
1922-
; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
1923-
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1910+
; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, s[4:5], s7, 0, s[4:5]
1911+
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
19241912
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
19251913
; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
19261914
; GFX1030W64-NEXT: s_endpgm
@@ -1935,10 +1923,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19351923
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
19361924
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
19371925
; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0
1938-
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1926+
; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
19391927
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1940-
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1941-
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1928+
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19421929
; GFX11-NEXT: s_clause 0x1
19431930
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
19441931
; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
@@ -1949,16 +1936,17 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
19491936
; GFX1250-NEXT: s_clause 0x1
19501937
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
19511938
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1952-
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
19531939
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1940+
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
19541941
; GFX1250-NEXT: s_wait_kmcnt 0x0
1955-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1956-
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1]
1957-
; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3]
1958-
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1942+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1943+
; GFX1250-NEXT: v_sub_co_u32 v0, s4, s6, v0
1944+
; GFX1250-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1945+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
1946+
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
19591947
; GFX1250-NEXT: s_clause 0x1
1960-
; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1]
1961-
; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3]
1948+
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1949+
; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3]
19621950
; GFX1250-NEXT: s_endpgm
19631951
%tid = call i32 @llvm.amdgcn.workitem.id.x()
19641952
%tid.ext = sext i32 %tid to i64

0 commit comments

Comments
 (0)