@@ -8760,8 +8760,9 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
87608760; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87618761; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
87628762; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
8763- ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
8763+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
87648764; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
8765+ ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
87658766; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
87668767; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87678768; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8780,19 +8781,20 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
87808781; GFX90A-NEXT: s_cbranch_execz .LBB113_6
87818782; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
87828783; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8783- ; GFX90A-NEXT: v_cndmask_b32_e32 v0 , -1, v4, vcc
8784- ; GFX90A-NEXT: buffer_load_dword v1, v0 , s[0:3], 0 offen
8785- ; GFX90A-NEXT: buffer_load_dword v2, v0 , s[0:3], 0 offen offset:4
8784+ ; GFX90A-NEXT: v_cndmask_b32_e32 v4 , -1, v4, vcc
8785+ ; GFX90A-NEXT: buffer_load_dword v0, v4 , s[0:3], 0 offen
8786+ ; GFX90A-NEXT: buffer_load_dword v1, v4 , s[0:3], 0 offen offset:4
87868787; GFX90A-NEXT: s_waitcnt vmcnt(1)
8787- ; GFX90A-NEXT: v_sub_co_u32_e32 v3 , vcc, v1 , v6
8788+ ; GFX90A-NEXT: v_sub_co_u32_e32 v2 , vcc, v0 , v6
87888789; GFX90A-NEXT: s_waitcnt vmcnt(0)
8789- ; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v7, vcc
8790- ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
8791- ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
8792- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
8793- ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
8794- ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
8795- ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
8790+ ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
8791+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
8792+ ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
8793+ ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
8794+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
8795+ ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8796+ ; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
8797+ ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
87968798; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
87978799; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
87988800; GFX90A-NEXT: ;;#ASMSTART
@@ -8826,9 +8828,10 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
88268828; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
88278829; GFX950-NEXT: s_nop 1
88288830; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
8831+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
88298832; GFX950-NEXT: s_nop 1
8830- ; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
88318833; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
8834+ ; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
88328835; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
88338836; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
88348837; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8854,11 +8857,11 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
88548857; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
88558858; GFX950-NEXT: s_nop 1
88568859; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
8860+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
88578861; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
8858- ; GFX950-NEXT: s_nop 0
8862+ ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
88598863; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
88608864; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8861- ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
88628865; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
88638866; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
88648867; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -8898,8 +8901,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
88988901; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
88998902; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
89008903; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
8901- ; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
8904+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
89028905; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
8906+ ; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
89038907; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
89048908; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
89058909; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
@@ -8915,17 +8919,18 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89158919; GFX90A-NEXT: s_cbranch_execz .LBB114_6
89168920; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
89178921; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8918- ; GFX90A-NEXT: v_cndmask_b32_e32 v0 , -1, v0, vcc
8919- ; GFX90A-NEXT: buffer_load_dword v4, v0 , s[0:3], 0 offen
8920- ; GFX90A-NEXT: buffer_load_dword v5, v0 , s[0:3], 0 offen offset:4
8922+ ; GFX90A-NEXT: v_cndmask_b32_e32 v6 , -1, v0, vcc
8923+ ; GFX90A-NEXT: buffer_load_dword v4, v6 , s[0:3], 0 offen
8924+ ; GFX90A-NEXT: buffer_load_dword v5, v6 , s[0:3], 0 offen offset:4
89218925; GFX90A-NEXT: s_waitcnt vmcnt(1)
8922- ; GFX90A-NEXT: v_sub_co_u32_e32 v1 , vcc, v4, v2
8926+ ; GFX90A-NEXT: v_sub_co_u32_e32 v0 , vcc, v4, v2
89238927; GFX90A-NEXT: s_waitcnt vmcnt(0)
8924- ; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc
8928+ ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v3, vcc
8929+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
8930+ ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
89258931; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
8926- ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8927- ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
8928- ; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
8932+ ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
8933+ ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
89298934; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
89308935; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
89318936; GFX90A-NEXT: ;;#ASMSTART
@@ -8958,9 +8963,10 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89588963; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
89598964; GFX950-NEXT: s_nop 1
89608965; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
8966+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
89618967; GFX950-NEXT: s_nop 1
8962- ; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
89638968; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
8969+ ; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
89648970; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
89658971; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
89668972; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -8983,6 +8989,7 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89838989; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
89848990; GFX950-NEXT: s_nop 1
89858991; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
8992+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
89868993; GFX950-NEXT: s_nop 1
89878994; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
89888995; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -17058,8 +17065,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1705817065; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1705917066; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
1706017067; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
17061- ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
17068+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1706217069; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
17070+ ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1706317071; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
1706417072; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1706517073; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17078,19 +17086,20 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1707817086; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
1707917087; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
1708017088; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
17081- ; GFX90A-NEXT: v_mov_b32_e32 v0 , s4
17082- ; GFX90A-NEXT: buffer_load_dword v1, v0 , s[0:3], 0 offen
17083- ; GFX90A-NEXT: buffer_load_dword v2, v0 , s[0:3], 0 offen offset:4
17089+ ; GFX90A-NEXT: v_mov_b32_e32 v6 , s4
17090+ ; GFX90A-NEXT: buffer_load_dword v0, v6 , s[0:3], 0 offen
17091+ ; GFX90A-NEXT: buffer_load_dword v1, v6 , s[0:3], 0 offen offset:4
1708417092; GFX90A-NEXT: s_waitcnt vmcnt(1)
17085- ; GFX90A-NEXT: v_sub_co_u32_e32 v3 , vcc, v1 , v4
17093+ ; GFX90A-NEXT: v_sub_co_u32_e32 v2 , vcc, v0 , v4
1708617094; GFX90A-NEXT: s_waitcnt vmcnt(0)
17087- ; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
17088- ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
17089- ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
17090- ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
17091- ; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
17092- ; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
17093- ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
17095+ ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
17096+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
17097+ ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
17098+ ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
17099+ ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
17100+ ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
17101+ ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
17102+ ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
1709417103; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
1709517104; GFX90A-NEXT: ;;#ASMSTART
1709617105; GFX90A-NEXT: ; use a[0:1]
@@ -17123,9 +17132,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1712317132; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
1712417133; GFX950-NEXT: s_nop 1
1712517134; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
17135+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1712617136; GFX950-NEXT: s_nop 1
17127- ; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1712817137; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
17138+ ; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1712917139; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
1713017140; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1713117141; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17149,11 +17159,11 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1714917159; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
1715017160; GFX950-NEXT: s_nop 1
1715117161; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
17162+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
1715217163; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
17153- ; GFX950-NEXT: s_nop 0
17164+ ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
1715417165; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
1715517166; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
17156- ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
1715717167; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
1715817168; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
1715917169; GFX950-NEXT: ;;#ASMSTART
@@ -17192,8 +17202,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1719217202; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
1719317203; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
1719417204; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
17195- ; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
17205+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
1719617206; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
17207+ ; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
1719717208; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
1719817209; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1719917210; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17216,6 +17227,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1721617227; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
1721717228; GFX90A-NEXT: s_waitcnt vmcnt(0)
1721817229; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
17230+ ; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1721917231; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1722017232; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1722117233; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
@@ -17251,9 +17263,10 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1725117263; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
1725217264; GFX950-NEXT: s_nop 1
1725317265; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
17266+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
1725417267; GFX950-NEXT: s_nop 1
17255- ; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
1725617268; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
17269+ ; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
1725717270; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
1725817271; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1725917272; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17274,6 +17287,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1727417287; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
1727517288; GFX950-NEXT: s_nop 1
1727617289; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
17290+ ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1727717291; GFX950-NEXT: s_nop 1
1727817292; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1727917293; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
0 commit comments