Commit 7c30e38

Do not generate S_CMP if add/sub carryout is available
Signed-off-by: John Lu <[email protected]>
1 parent: 1195022

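The premise of the change, in broad terms (a hedged model, not AMDGPU code): the scalar add/sub that produced the low half already leaves its carry/borrow in SCC. The S_CSELECT that materializes that bit as [1,-1] versus 0 and the later S_CMP_LG that tests the materialized value against zero only reproduce the same bit, so when nothing in between redefines SCC the compare can be dropped. A small C++ model of that round trip:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint32_t X : {0u, 1u, 0x7FFFFFFFu, 0xFFFFFFFFu})
    for (uint32_t Y : {0u, 1u, 0x80000000u, 0xFFFFFFFFu}) {
      uint32_t Sum = X + Y;          // scalar 32-bit add, wraps mod 2^32
      bool SCC = Sum < X;            // unsigned carry-out lands in SCC

      uint32_t SReg = SCC ? 1u : 0u; // S_CSELECT-style materialization of SCC
      bool SCCAgain = (SReg != 0);   // S_CMP_LG-style retest of that register

      assert(SCCAgain == SCC && "the compare adds no new information");
    }
  return 0;
}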
File tree

1 file changed: +93 -14

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 14 deletions
@@ -6081,9 +6081,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MachineOperand &Src0 = MI.getOperand(2);
     MachineOperand &Src1 = MI.getOperand(3);
     MachineOperand &Src2 = MI.getOperand(4);
-    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
-                       ? AMDGPU::S_ADDC_U32
-                       : AMDGPU::S_SUBB_U32;
+
+    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO);
+
     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6103,6 +6103,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
       Src2.setReg(RegOp2);
     }
 
+<<<<<<< HEAD
     if (ST.isWave64()) {
       if (ST.hasScalarCompareEq64()) {
         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
@@ -6140,6 +6141,89 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 
     unsigned SelOpc =
         ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+=======
+    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+    assert(WaveSize == 64 || WaveSize == 32);
+
+    unsigned SelOpc =
+        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+    unsigned AddcSubbOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    unsigned AddSubOpc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+    // Lowering for:
+    //
+    //    S_UADDO_PSEUDO|S_ADD_CO_PSEUDO
+    //    <no SCC def code>
+    //    S_ADD_CO_PSEUDO
+    //
+    // produces:
+    //
+    //    S_ADD_I32|S_ADDC_U32              ; lowered from S_UADDO_PSEUDO
+    //    SREG = S_CSELECT_B32|64 [1,-1], 0 ; lowered from S_UADDO_PSEUDO
+    //    <no SCC def code>
+    //    S_CMP32|64 SREG, 0                ; lowered from S_ADD_CO_PSEUDO
+    //    S_ADDC_U32                        ; lowered from S_ADD_CO_PSEUDO
+    //
+    // At this point before generating the S_CMP check if it is redundant. If
+    // so do not recalculate it. Subsequent optimizations will also delete the
+    // dead S_CSELECT*.
+
+    bool RecalculateSCC{true};
+    MachineInstr *Def = MRI.getVRegDef(Src2.getReg());
+    if (Def && Def->getParent() == BB && Def->getOpcode() == SelOpc &&
+        Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0 &&
+        Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0) {
+
+      auto I1 = std::next(MachineBasicBlock::reverse_iterator(Def));
+      if (I1 != BB->rend() &&
+          (I1->getOpcode() == AddSubOpc || I1->getOpcode() == AddcSubbOpc)) {
+        RecalculateSCC = false;
+        // Ensure there are no intervening definitions of SCC.
+        for (auto I2 = std::next(MachineBasicBlock::reverse_iterator(MI));
+             I2 != I1; I2++) {
+          if (I2->definesRegister(AMDGPU::SCC, TRI)) {
+            RecalculateSCC = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (RecalculateSCC) {
+      if (WaveSize == 64) {
+        if (ST.hasScalarCompareEq64()) {
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+              .addReg(Src2.getReg())
+              .addImm(0);
+        } else {
+          const TargetRegisterClass *SubRC =
+              TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+          MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+          MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+          Register Src2_32 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+              .add(Src2Sub0)
+              .add(Src2Sub1);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+              .addReg(Src2_32, RegState::Kill)
+              .addImm(0);
+        }
+      } else {
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+            .addReg(Src2.getReg())
+            .addImm(0);
+      }
+    }
+
+    BuildMI(*BB, MII, DL, TII->get(AddcSubbOpc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+>>>>>>> 0cb43743ea30 (Do not generate S_CMP if add/sub carryout is available)
 
     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
         .addImm(-1)
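The heart of the hunk above is the RecalculateSCC scan: the S_CMP is skipped only when the value feeding Src2 comes from an S_CSELECT of the [nonzero, 0] form that directly follows an add/sub setting SCC, and nothing between that point and the pseudo being expanded clobbers SCC. A minimal standalone sketch of that decision, using a hypothetical Inst type and mustRecalculateSCC helper rather than the LLVM MachineInstr API:

// Minimal standalone sketch (hypothetical types, not LLVM's MachineInstr API)
// of the RecalculateSCC decision in the hunk above.
#include <cstddef>
#include <string>
#include <vector>

struct Inst {
  std::string Opcode;
  bool DefinesSCC; // would this instruction clobber the SCC bit?
};

// Block is a basic block in program order, DefIdx the S_CSELECT that feeds
// Src2, UseIdx the S_ADD_CO/S_SUB_CO pseudo being expanded.
bool mustRecalculateSCC(const std::vector<Inst> &Block, std::size_t DefIdx,
                        std::size_t UseIdx) {
  // The S_CSELECT must directly follow the add/sub that produced the carry.
  if (DefIdx == 0)
    return true;
  const std::string &Prev = Block[DefIdx - 1].Opcode;
  if (Prev != "S_ADD_I32" && Prev != "S_ADDC_U32" && Prev != "S_SUB_I32" &&
      Prev != "S_SUBB_U32")
    return true;

  // Any intervening definition of SCC invalidates the saved carry-out.
  for (std::size_t I = DefIdx + 1; I < UseIdx; ++I)
    if (Block[I].DefinesSCC)
      return true;

  return false; // SCC still holds the carry/borrow: skip the S_CMP.
}

When this returns false, SCC still holds the carry/borrow of the earlier add/sub and the expansion emits S_ADDC_U32/S_SUBB_U32 directly; the now-unused S_CSELECT is left for later dead-code elimination, as the in-tree comment notes.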
@@ -16588,17 +16672,12 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
   //   LHS = ADD i64 Y, 1          LHSlo = UADDO i32 Ylo, 1
   //   setcc LHS eq 0         ->   LHSHi = UADDO_CARRY i32 Yhi, 0
 
-  // Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
-  // non-divergent operations. This can result in lo/hi 32-bit operations
-  // being done in SGPR and VGPR with additional operations being needed
-  // to move operands and/or generate the intermediate carry.
-  if (VT == MVT::i64 && N->isDivergent() &&
-      ((CC == ISD::SETULT &&
-        sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETUGT &&
-        sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
-        sd_match(LHS, m_Add(m_Value(), m_One()))))) {
+  if (VT == MVT::i64 && ((CC == ISD::SETULT &&
+                          sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETUGT &&
+                          sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
+                          sd_match(LHS, m_Add(m_Value(), m_One()))))) {
     EVT TargetType = MVT::i32;
     EVT CarryVT = MVT::i1;
     bool IsAdd = LHS.getOpcode() == ISD::ADD;
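This second hunk drops the N->isDivergent() guard (and the comment justifying it), so the setcc combine now splits the three carry-detection patterns into 32-bit UADDO/USUBO halves for i64 on uniform operations as well, which dovetails with the expansion change above that reuses the scalar carry-out instead of re-comparing. The matched patterns are plain unsigned overflow checks; a hedged standalone sketch of the arithmetic behind them (illustrative only, not SelectionDAG code):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xFFFFFFFFFFFFFFFEull, Y = 5;

  // setult (add X, Y), X  <=>  the unsigned 64-bit add carries out.
  uint64_t Sum = X + Y; // wraps modulo 2^64
  assert((Sum < X) == (Y > UINT64_MAX - X));

  // setugt (sub X, Y), X  <=>  the unsigned 64-bit sub borrows (X < Y).
  uint64_t Diff = X - Y;
  assert((Diff > X) == (X < Y));

  // seteq (add Y, 1), 0   <=>  the increment wrapped, i.e. Y was all-ones.
  assert(((Y + 1) == 0) == (Y == UINT64_MAX));
  return 0;
}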
