Skip to content

Commit f64b4d6

Browse files
LU-JOHN authored and svkeerthy committed
[AMDGPU] Remove setcc by using add/sub carryout (#155255)
Remove setcc instruction by utilizing add/sub carryout. Addresses #152992. --------- Signed-off-by: John Lu <[email protected]>
1 parent f67e4dc commit f64b4d6

File tree

14 files changed

+1538
-1473
lines changed

14 files changed

+1538
-1473
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,8 +1112,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
11121112
{N->getOperand(0), N->getOperand(1),
11131113
CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
11141114
} else {
1115-
unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1116-
: AMDGPU::S_USUBO_PSEUDO;
1115+
unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
11171116

11181117
CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
11191118
{N->getOperand(0), N->getOperand(1)});

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6073,9 +6073,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
60736073
MachineOperand &Src0 = MI.getOperand(2);
60746074
MachineOperand &Src1 = MI.getOperand(3);
60756075
MachineOperand &Src2 = MI.getOperand(4);
6076-
unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6077-
? AMDGPU::S_ADDC_U32
6078-
: AMDGPU::S_SUBB_U32;
60796076
if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
60806077
Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
60816078
BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6124,11 +6121,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
61246121
.addImm(0);
61256122
}
61266123

6127-
// clang-format off
6128-
BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6129-
.add(Src0)
6130-
.add(Src1);
6131-
// clang-format on
6124+
unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6125+
? AMDGPU::S_ADDC_U32
6126+
: AMDGPU::S_SUBB_U32;
6127+
6128+
BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
61326129

61336130
unsigned SelOpc =
61346131
ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
@@ -16571,6 +16568,53 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
1657116568
}
1657216569
}
1657316570

16571+
// Eliminate setcc by using carryout from add/sub instruction
16572+
16573+
// LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
16574+
// setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
16575+
// similarly for subtraction
16576+
16577+
// LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
16578+
// setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
16579+
16580+
if (VT == MVT::i64 && ((CC == ISD::SETULT &&
16581+
sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
16582+
(CC == ISD::SETUGT &&
16583+
sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
16584+
(CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
16585+
sd_match(LHS, m_Add(m_Value(), m_One()))))) {
16586+
bool IsAdd = LHS.getOpcode() == ISD::ADD;
16587+
16588+
SDValue Op0 = LHS.getOperand(0);
16589+
SDValue Op1 = LHS.getOperand(1);
16590+
16591+
SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
16592+
SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
16593+
16594+
SDValue Op0Hi = getHiHalf64(Op0, DAG);
16595+
SDValue Op1Hi = getHiHalf64(Op1, DAG);
16596+
16597+
SDValue NodeLo =
16598+
DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
16599+
DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16600+
16601+
SDValue CarryInHi = NodeLo.getValue(1);
16602+
SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
16603+
SL, DAG.getVTList(MVT::i32, MVT::i1),
16604+
{Op0Hi, Op1Hi, CarryInHi});
16605+
16606+
SDValue ResultLo = NodeLo.getValue(0);
16607+
SDValue ResultHi = NodeHi.getValue(0);
16608+
16609+
SDValue JoinedResult =
16610+
DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
16611+
16612+
SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
16613+
SDValue Overflow = NodeHi.getValue(1);
16614+
DCI.CombineTo(LHS.getNode(), Result);
16615+
return Overflow;
16616+
}
16617+
1657416618
if (VT != MVT::f32 && VT != MVT::f64 &&
1657516619
(!Subtarget->has16BitInsts() || VT != MVT::f16))
1657616620
return SDValue();

llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll

Lines changed: 42 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8759,9 +8759,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
87598759
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87608760
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
87618761
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
8762-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
8763-
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
87648762
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
8763+
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
87658764
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
87668765
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87678766
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8780,20 +8779,19 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
87808779
; GFX90A-NEXT: s_cbranch_execz .LBB113_6
87818780
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
87828781
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8783-
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8784-
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8785-
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
8782+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
8783+
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
8784+
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
87868785
; GFX90A-NEXT: s_waitcnt vmcnt(1)
8787-
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
8786+
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v6
87888787
; GFX90A-NEXT: s_waitcnt vmcnt(0)
8789-
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
8790-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
8791-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
8792-
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
8793-
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
8794-
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8795-
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
8796-
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8788+
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v7, vcc
8789+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
8790+
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
8791+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
8792+
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
8793+
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
8794+
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
87978795
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
87988796
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
87998797
; GFX90A-NEXT: ;;#ASMSTART
@@ -8827,10 +8825,9 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
88278825
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
88288826
; GFX950-NEXT: s_nop 1
88298827
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
8830-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
88318828
; GFX950-NEXT: s_nop 1
8832-
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
88338829
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
8830+
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
88348831
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
88358832
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
88368833
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -8856,11 +8853,11 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
88568853
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6
88578854
; GFX950-NEXT: s_nop 1
88588855
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
8859-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
88608856
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
8861-
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
8857+
; GFX950-NEXT: s_nop 0
88628858
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
88638859
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8860+
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
88648861
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
88658862
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
88668863
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -8900,9 +8897,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89008897
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
89018898
; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
89028899
; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
8903-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7]
8904-
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
89058900
; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
8901+
; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
89068902
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
89078903
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
89088904
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
@@ -8918,18 +8914,17 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89188914
; GFX90A-NEXT: s_cbranch_execz .LBB114_6
89198915
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
89208916
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8921-
; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
8922-
; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
8923-
; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
8917+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8918+
; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8919+
; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
89248920
; GFX90A-NEXT: s_waitcnt vmcnt(1)
8925-
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2
8921+
; GFX90A-NEXT: v_sub_co_u32_e32 v1, vcc, v4, v2
89268922
; GFX90A-NEXT: s_waitcnt vmcnt(0)
8927-
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v3, vcc
8928-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
8929-
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
8923+
; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v3, vcc
89308924
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
8931-
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
8932-
; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
8925+
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
8926+
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
8927+
; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
89338928
; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi
89348929
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
89358930
; GFX90A-NEXT: ;;#ASMSTART
@@ -8962,10 +8957,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89628957
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
89638958
; GFX950-NEXT: s_nop 1
89648959
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
8965-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
89668960
; GFX950-NEXT: s_nop 1
8967-
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
89688961
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
8962+
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
89698963
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
89708964
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
89718965
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -8988,7 +8982,6 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 {
89888982
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
89898983
; GFX950-NEXT: s_nop 1
89908984
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
8991-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
89928985
; GFX950-NEXT: s_nop 1
89938986
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
89948987
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -17064,9 +17057,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1706417057
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1706517058
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
1706617059
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
17067-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
17068-
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1706917060
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
17061+
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1707017062
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
1707117063
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1707217064
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17085,20 +17077,19 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1708517077
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
1708617078
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
1708717079
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
17088-
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
17089-
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
17090-
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
17080+
; GFX90A-NEXT: v_mov_b32_e32 v0, s4
17081+
; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
17082+
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
1709117083
; GFX90A-NEXT: s_waitcnt vmcnt(1)
17092-
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
17084+
; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v1, v4
1709317085
; GFX90A-NEXT: s_waitcnt vmcnt(0)
17094-
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
17095-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
17096-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
17097-
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
17098-
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
17099-
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
17100-
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
17101-
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
17086+
; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
17087+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
17088+
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
17089+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
17090+
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
17091+
; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
17092+
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
1710217093
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
1710317094
; GFX90A-NEXT: ;;#ASMSTART
1710417095
; GFX90A-NEXT: ; use a[0:1]
@@ -17131,10 +17122,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1713117122
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
1713217123
; GFX950-NEXT: s_nop 1
1713317124
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
17134-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1713517125
; GFX950-NEXT: s_nop 1
17136-
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1713717126
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
17127+
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1713817128
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
1713917129
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1714017130
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
@@ -17158,11 +17148,11 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1715817148
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
1715917149
; GFX950-NEXT: s_nop 1
1716017150
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
17161-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
1716217151
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
17163-
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
17152+
; GFX950-NEXT: s_nop 0
1716417153
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
1716517154
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
17155+
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
1716617156
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
1716717157
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
1716817158
; GFX950-NEXT: ;;#ASMSTART
@@ -17201,9 +17191,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1720117191
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
1720217192
; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
1720317193
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
17204-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
17205-
; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
1720617194
; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
17195+
; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
1720717196
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc
1720817197
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1720917198
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17226,7 +17215,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1722617215
; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
1722717216
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1722817217
; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
17229-
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1723017218
; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1723117219
; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1723217220
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
@@ -17262,10 +17250,9 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1726217250
; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0
1726317251
; GFX950-NEXT: s_nop 1
1726417252
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc
17265-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
1726617253
; GFX950-NEXT: s_nop 1
17267-
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
1726817254
; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc
17255+
; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc
1726917256
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0
1727017257
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1727117258
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
@@ -17286,7 +17273,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 {
1728617273
; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
1728717274
; GFX950-NEXT: s_nop 1
1728817275
; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
17289-
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
1729017276
; GFX950-NEXT: s_nop 1
1729117277
; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
1729217278
; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc

0 commit comments

Comments
 (0)