Skip to content

Commit 6fe58f4

Browse files
committed
[AMDGPU] Allow forming overflow ops and folding ABD to USUBO when the operation is legal.
Because USUBO and UADDO are legal for 32-bit integers on AMDGPU, we want to use them whenever possible.
1 parent a49030e commit 6fe58f4

File tree

8 files changed

+303
-371
lines changed

8 files changed

+303
-371
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
34553455
/// matching of other patterns.
34563456
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
34573457
bool MathUsed) const {
3458+
// Form it if it is legal.
3459+
if (isOperationLegal(Opcode, VT))
3460+
return true;
3461+
34583462
// TODO: The default logic is inherited from code in CodeGenPrepare.
34593463
// The opcode should not make a difference by default?
34603464
if (Opcode != ISD::UADDO)

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9788,7 +9788,8 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
97889788
// flag if the (scalar) type is illegal as this is more likely to legalize
97899789
// cleanly:
97909790
// abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs))
9791-
if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) {
9791+
if (!IsSigned && (isOperationLegal(ISD::USUBO, VT) ||
9792+
(VT.isScalarInteger() && !isTypeLegal(VT)))) {
97929793
SDValue USubO =
97939794
DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS});
97949795
SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1));

llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
9898
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
9999
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
100100
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
101-
; GFX7-NEXT: s_add_i32 s0, s2, s2
102-
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
103-
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
104-
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
101+
; GFX7-NEXT: v_add_i32_e64 v0, s[0:1], s2, s2
105102
; GFX7-NEXT: s_or_b32 s0, s0, s1
106103
; GFX7-NEXT: s_cmp_lg_u32 s0, 0
107104
; GFX7-NEXT: s_addc_u32 s0, s2, 0
108-
; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
105+
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
106+
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
109107
; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
110108
; GFX7-NEXT: ; %bb.1: ; %bb0
111109
; GFX7-NEXT: v_mov_b32_e32 v0, 0
@@ -125,13 +123,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
125123
; GFX9: ; %bb.0: ; %bb
126124
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
127125
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
128-
; GFX9-NEXT: s_add_i32 s0, s2, s2
129-
; GFX9-NEXT: s_cmp_lt_u32 s0, s2
130-
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
126+
; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, s2
131127
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
132-
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
133128
; GFX9-NEXT: s_addc_u32 s0, s2, 0
134-
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
129+
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
130+
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
135131
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
136132
; GFX9-NEXT: ; %bb.1: ; %bb0
137133
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -151,13 +147,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
151147
; GFX10: ; %bb.0: ; %bb
152148
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
153149
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
154-
; GFX10-NEXT: s_add_i32 s1, s0, s0
155-
; GFX10-NEXT: s_cmp_lt_u32 s1, s0
156-
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
157-
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
150+
; GFX10-NEXT: v_add_co_u32 v0, s1, s0, s0
158151
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
159152
; GFX10-NEXT: s_addc_u32 s0, s0, 0
160-
; GFX10-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
153+
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
154+
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
161155
; GFX10-NEXT: s_cbranch_vccnz .LBB1_2
162156
; GFX10-NEXT: ; %bb.1: ; %bb0
163157
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -177,15 +171,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
177171
; GFX11: ; %bb.0: ; %bb
178172
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
179173
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
180-
; GFX11-NEXT: s_add_i32 s1, s0, s0
181-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
182-
; GFX11-NEXT: s_cmp_lt_u32 s1, s0
183-
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
184-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
174+
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
185175
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
186176
; GFX11-NEXT: s_addc_u32 s0, s0, 0
187-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
188-
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
177+
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
178+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
179+
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
189180
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
190181
; GFX11-NEXT: ; %bb.1: ; %bb0
191182
; GFX11-NEXT: v_mov_b32_e32 v0, 0

llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll

Lines changed: 24 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -8946,8 +8946,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89468946
; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start
89478947
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
89488948
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8949-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
8950-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8949+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
89518950
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
89528951
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
89538952
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8971,8 +8970,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89718970
; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start
89728971
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
89738972
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8974-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
8975-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8973+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
89768974
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
89778975
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
89788976
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8996,9 +8994,8 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89968994
; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start
89978995
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
89988996
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8999-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8997+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
90008998
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9001-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
90028999
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
90039000
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
90049001
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9027,8 +9024,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90279024
; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start
90289025
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
90299026
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9030-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
9031-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9027+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
90329028
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
90339029
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
90349030
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9054,8 +9050,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90549050
; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start
90559051
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
90569052
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9057-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
9058-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9053+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
90599054
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
90609055
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
90619056
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9079,9 +9074,8 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90799074
; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start
90809075
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
90819076
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9082-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9077+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
90839078
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9084-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
90859079
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
90869080
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
90879081
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9110,8 +9104,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91109104
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
91119105
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91129106
; GCN1-NEXT: v_mov_b32_e32 v4, v3
9113-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
9114-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9107+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
91159108
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
91169109
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
91179110
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9136,8 +9129,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91369129
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
91379130
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91389131
; GCN2-NEXT: v_mov_b32_e32 v4, v3
9139-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
9140-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9132+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
91419133
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
91429134
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
91439135
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9162,9 +9154,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91629154
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
91639155
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91649156
; GCN3-NEXT: v_mov_b32_e32 v4, v3
9165-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9157+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
91669158
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9167-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
91689159
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
91699160
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
91709161
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9194,8 +9185,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
91949185
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
91959186
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91969187
; GCN1-NEXT: v_mov_b32_e32 v1, v0
9197-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1
9198-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
9188+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1
91999189
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
92009190
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
92019191
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9221,8 +9211,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
92219211
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
92229212
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
92239213
; GCN2-NEXT: v_mov_b32_e32 v1, v0
9224-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1
9225-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
9214+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1
92269215
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
92279216
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
92289217
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9246,9 +9235,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
92469235
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
92479236
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
92489237
; GCN3-NEXT: v_mov_b32_e32 v4, v3
9249-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9238+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
92509239
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9251-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
92529240
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
92539241
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
92549242
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9279,8 +9267,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
92799267
; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start
92809268
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
92819269
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9282-
; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
9283-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9270+
; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3
92849271
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
92859272
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
92869273
; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9307,8 +9294,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
93079294
; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start
93089295
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
93099296
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9310-
; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
9311-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9297+
; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3
93129298
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
93139299
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
93149300
; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9335,9 +9321,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
93359321
; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start
93369322
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
93379323
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9338-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9324+
; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3
93399325
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
9340-
; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
93419326
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
93429327
; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
93439328
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -9369,8 +9354,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
93699354
; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start
93709355
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
93719356
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9372-
; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
9373-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9357+
; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3
93749358
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
93759359
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
93769360
; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9399,8 +9383,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
93999383
; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start
94009384
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
94019385
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9402-
; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
9403-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9386+
; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3
94049387
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
94059388
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
94069389
; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9427,9 +9410,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
94279410
; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start
94289411
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
94299412
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9430-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9413+
; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3
94319414
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
9432-
; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
94339415
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
94349416
; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
94359417
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
@@ -9463,8 +9445,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
94639445
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
94649446
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
94659447
; GCN1-NEXT: v_mov_b32_e32 v5, v0
9466-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
9467-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9448+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5
94689449
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
94699450
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
94709451
; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9493,8 +9474,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
94939474
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
94949475
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
94959476
; GCN2-NEXT: v_mov_b32_e32 v5, v0
9496-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
9497-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9477+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5
94989478
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
94999479
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
95009480
; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9523,9 +9503,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
95239503
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
95249504
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95259505
; GCN3-NEXT: v_mov_b32_e32 v5, v0
9526-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9506+
; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5
95279507
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
9528-
; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
95299508
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
95309509
; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
95319510
; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
@@ -9557,8 +9536,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
95579536
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
95589537
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95599538
; GCN1-NEXT: v_mov_b32_e32 v5, v0
9560-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
9561-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9539+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5
95629540
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
95639541
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
95649542
; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9587,8 +9565,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
95879565
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
95889566
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95899567
; GCN2-NEXT: v_mov_b32_e32 v5, v0
9590-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
9591-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9568+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5
95929569
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
95939570
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
95949571
; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9617,9 +9594,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
96179594
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
96189595
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
96199596
; GCN3-NEXT: v_mov_b32_e32 v5, v0
9620-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9597+
; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5
96219598
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
9622-
; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
96239599
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
96249600
; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
96259601
; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc

0 commit comments

Comments
 (0)