Skip to content

Commit d763be5

Browse files
AZero13 authored and mahesh-attarde committed
[AMDGPU][TargetLowering] Allow forming overflow op if it is legal (llvm#156266)
Because usubo and uaddo are legal on AMDGPU for 32-bit types, we want to use them whenever possible.
1 parent 0b2f1bd commit d763be5

File tree

7 files changed

+301
-370
lines changed

7 files changed

+301
-370
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase {
34553455
/// matching of other patterns.
34563456
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
34573457
bool MathUsed) const {
3458+
// Form it if it is legal.
3459+
if (isOperationLegal(Opcode, VT))
3460+
return true;
3461+
34583462
// TODO: The default logic is inherited from code in CodeGenPrepare.
34593463
// The opcode should not make a difference by default?
34603464
if (Opcode != ISD::UADDO)

llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll

Lines changed: 13 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -98,14 +98,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
9898
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
9999
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
100100
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
101-
; GFX7-NEXT: s_add_i32 s0, s2, s2
102-
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
103-
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
104-
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
101+
; GFX7-NEXT: v_add_i32_e64 v0, s[0:1], s2, s2
105102
; GFX7-NEXT: s_or_b32 s0, s0, s1
106103
; GFX7-NEXT: s_cmp_lg_u32 s0, 0
107104
; GFX7-NEXT: s_addc_u32 s0, s2, 0
108-
; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
105+
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
106+
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
109107
; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
110108
; GFX7-NEXT: ; %bb.1: ; %bb0
111109
; GFX7-NEXT: v_mov_b32_e32 v0, 0
@@ -125,13 +123,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
125123
; GFX9: ; %bb.0: ; %bb
126124
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
127125
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
128-
; GFX9-NEXT: s_add_i32 s0, s2, s2
129-
; GFX9-NEXT: s_cmp_lt_u32 s0, s2
130-
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
126+
; GFX9-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, s2
131127
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
132-
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
133128
; GFX9-NEXT: s_addc_u32 s0, s2, 0
134-
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
129+
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
130+
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
135131
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
136132
; GFX9-NEXT: ; %bb.1: ; %bb0
137133
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -151,13 +147,11 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
151147
; GFX10: ; %bb.0: ; %bb
152148
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
153149
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
154-
; GFX10-NEXT: s_add_i32 s1, s0, s0
155-
; GFX10-NEXT: s_cmp_lt_u32 s1, s0
156-
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
157-
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
150+
; GFX10-NEXT: v_add_co_u32 v0, s1, s0, s0
158151
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
159152
; GFX10-NEXT: s_addc_u32 s0, s0, 0
160-
; GFX10-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
153+
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
154+
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
161155
; GFX10-NEXT: s_cbranch_vccnz .LBB1_2
162156
; GFX10-NEXT: ; %bb.1: ; %bb0
163157
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -177,15 +171,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
177171
; GFX11: ; %bb.0: ; %bb
178172
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
179173
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
180-
; GFX11-NEXT: s_add_i32 s1, s0, s0
181-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
182-
; GFX11-NEXT: s_cmp_lt_u32 s1, s0
183-
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
184-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
174+
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
185175
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
186176
; GFX11-NEXT: s_addc_u32 s0, s0, 0
187-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
188-
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0
177+
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
178+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
179+
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
189180
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
190181
; GFX11-NEXT: ; %bb.1: ; %bb0
191182
; GFX11-NEXT: v_mov_b32_e32 v0, 0

llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll

Lines changed: 24 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -8946,8 +8946,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89468946
; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start
89478947
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
89488948
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8949-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
8950-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8949+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
89518950
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
89528951
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
89538952
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8971,8 +8970,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89718970
; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start
89728971
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
89738972
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8974-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
8975-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8973+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
89768974
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
89778975
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
89788976
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -8996,9 +8994,8 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
89968994
; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start
89978995
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
89988996
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8999-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
8997+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
90008998
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9001-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
90028999
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
90039000
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
90049001
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9027,8 +9024,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90279024
; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start
90289025
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
90299026
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9030-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
9031-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9027+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
90329028
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
90339029
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
90349030
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9054,8 +9050,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90549050
; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start
90559051
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
90569052
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9057-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
9058-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9053+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
90599054
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
90609055
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
90619056
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9079,9 +9074,8 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
90799074
; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start
90809075
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
90819076
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9082-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9077+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
90839078
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9084-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
90859079
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
90869080
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
90879081
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9110,8 +9104,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91109104
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
91119105
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91129106
; GCN1-NEXT: v_mov_b32_e32 v4, v3
9113-
; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
9114-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9107+
; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4
91159108
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
91169109
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
91179110
; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9136,8 +9129,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91369129
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
91379130
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91389131
; GCN2-NEXT: v_mov_b32_e32 v4, v3
9139-
; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
9140-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9132+
; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4
91419133
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
91429134
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
91439135
; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
@@ -9162,9 +9154,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
91629154
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
91639155
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91649156
; GCN3-NEXT: v_mov_b32_e32 v4, v3
9165-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9157+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
91669158
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9167-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
91689159
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
91699160
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
91709161
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -9194,8 +9185,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
91949185
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
91959186
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91969187
; GCN1-NEXT: v_mov_b32_e32 v1, v0
9197-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1
9198-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
9188+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1
91999189
; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
92009190
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
92019191
; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9221,8 +9211,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
92219211
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
92229212
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
92239213
; GCN2-NEXT: v_mov_b32_e32 v1, v0
9224-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1
9225-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
9214+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1
92269215
; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
92279216
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
92289217
; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -9246,9 +9235,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
92469235
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
92479236
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
92489237
; GCN3-NEXT: v_mov_b32_e32 v4, v3
9249-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9238+
; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4
92509239
; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
9251-
; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
92529240
; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
92539241
; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
92549242
; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
@@ -9279,8 +9267,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
92799267
; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start
92809268
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
92819269
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9282-
; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
9283-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9270+
; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3
92849271
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
92859272
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
92869273
; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9307,8 +9294,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
93079294
; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start
93089295
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
93099296
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9310-
; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
9311-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9297+
; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3
93129298
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
93139299
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
93149300
; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9335,9 +9321,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
93359321
; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start
93369322
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
93379323
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9338-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9324+
; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3
93399325
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
9340-
; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
93419326
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
93429327
; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
93439328
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -9369,8 +9354,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
93699354
; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start
93709355
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
93719356
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9372-
; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
9373-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9357+
; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3
93749358
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
93759359
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
93769360
; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9399,8 +9383,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
93999383
; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start
94009384
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
94019385
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9402-
; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
9403-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9386+
; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3
94049387
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
94059388
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
94069389
; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -9427,9 +9410,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
94279410
; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start
94289411
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
94299412
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9430-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
9413+
; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3
94319414
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
9432-
; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
94339415
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
94349416
; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
94359417
; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
@@ -9463,8 +9445,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
94639445
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
94649446
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
94659447
; GCN1-NEXT: v_mov_b32_e32 v5, v0
9466-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
9467-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9448+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5
94689449
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
94699450
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
94709451
; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9493,8 +9474,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
94939474
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
94949475
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
94959476
; GCN2-NEXT: v_mov_b32_e32 v5, v0
9496-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
9497-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9477+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5
94989478
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
94999479
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
95009480
; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9523,9 +9503,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
95239503
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
95249504
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95259505
; GCN3-NEXT: v_mov_b32_e32 v5, v0
9526-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9506+
; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5
95279507
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
9528-
; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
95299508
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
95309509
; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
95319510
; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
@@ -9557,8 +9536,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
95579536
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
95589537
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95599538
; GCN1-NEXT: v_mov_b32_e32 v5, v0
9560-
; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
9561-
; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9539+
; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5
95629540
; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
95639541
; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
95649542
; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9587,8 +9565,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
95879565
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
95889566
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95899567
; GCN2-NEXT: v_mov_b32_e32 v5, v0
9590-
; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
9591-
; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9568+
; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5
95929569
; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
95939570
; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
95949571
; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
@@ -9617,9 +9594,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
96179594
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
96189595
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
96199596
; GCN3-NEXT: v_mov_b32_e32 v5, v0
9620-
; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
9597+
; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5
96219598
; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
9622-
; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
96239599
; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
96249600
; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
96259601
; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc

0 commit comments

Comments
 (0)