@@ -6081,9 +6081,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MachineOperand &Src0 = MI.getOperand(2);
     MachineOperand &Src1 = MI.getOperand(3);
     MachineOperand &Src2 = MI.getOperand(4);
-    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
-                       ? AMDGPU::S_ADDC_U32
-                       : AMDGPU::S_SUBB_U32;
+
+    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO);
+
     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6103,6 +6103,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
       Src2.setReg(RegOp2);
     }
 
+<<<<<<< HEAD
     if (ST.isWave64()) {
       if (ST.hasScalarCompareEq64()) {
         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
@@ -6140,6 +6141,89 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 
     unsigned SelOpc =
         ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+=======
+    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+    assert(WaveSize == 64 || WaveSize == 32);
+
+    unsigned SelOpc =
+        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+    unsigned AddcSubbOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    unsigned AddSubOpc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+    // Lowering for:
+    //
+    //   S_UADDO_PSEUDO|S_ADD_CO_PSEUDO
+    //   <no SCC def code>
+    //   S_ADD_CO_PSEUDO
+    //
+    // produces:
+    //
+    //   S_ADD_I32|S_ADDC_U32                ; lowered from S_UADDO_PSEUDO
+    //   SREG = S_CSELECT_B32|64 [1,-1], 0   ; lowered from S_UADDO_PSEUDO
+    //   <no SCC def code>
+    //   S_CMP_LG_U32|64 SREG, 0             ; lowered from S_ADD_CO_PSEUDO
+    //   S_ADDC_U32                          ; lowered from S_ADD_CO_PSEUDO
+    //
+    // At this point, before generating the S_CMP, check whether it is
+    // redundant. If so, do not recalculate it; subsequent optimizations will
+    // also delete the dead S_CSELECT*.
+
+    bool RecalculateSCC{true};
+    MachineInstr *Def = MRI.getVRegDef(Src2.getReg());
+    if (Def && Def->getParent() == BB && Def->getOpcode() == SelOpc &&
+        Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0 &&
+        Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0) {
+
+      auto I1 = std::next(MachineBasicBlock::reverse_iterator(Def));
+      if (I1 != BB->rend() &&
+          (I1->getOpcode() == AddSubOpc || I1->getOpcode() == AddcSubbOpc)) {
+        RecalculateSCC = false;
+        // Ensure there are no intervening definitions of SCC.
+        for (auto I2 = std::next(MachineBasicBlock::reverse_iterator(MI));
+             I2 != I1; I2++) {
+          if (I2->definesRegister(AMDGPU::SCC, TRI)) {
+            RecalculateSCC = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (RecalculateSCC) {
+      if (WaveSize == 64) {
+        if (ST.hasScalarCompareEq64()) {
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+              .addReg(Src2.getReg())
+              .addImm(0);
+        } else {
+          const TargetRegisterClass *SubRC =
+              TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+          MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+          MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+          Register Src2_32 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+              .add(Src2Sub0)
+              .add(Src2Sub1);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+              .addReg(Src2_32, RegState::Kill)
+              .addImm(0);
+        }
+      } else {
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+            .addReg(Src2.getReg())
+            .addImm(0);
+      }
+    }
+
+    BuildMI(*BB, MII, DL, TII->get(AddcSubbOpc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+>>>>>>> 0cb43743ea30 (Do not generate S_CMP if add/sub carryout is available)
 
     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
         .addImm(-1)
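
For context on why the S_CMP is redundant in the case the added code detects: below is a minimal standalone sketch (plain C++, not LLVM code; the function names and the scc variable are invented for illustration) of the carry chain the lowering comment describes. S_ADD_I32 leaves the unsigned carry-out in SCC and S_ADDC_U32 consumes it, so when nothing clobbers SCC in between, re-deriving the carry through S_CSELECT plus S_CMP adds no new information.

// Hypothetical model of the scalar carry chain, for illustration only.
#include <cassert>
#include <cstdint>

static bool scc; // models the SCC bit

static uint32_t scalar_add(uint32_t a, uint32_t b) {  // models S_ADD_I32
  uint64_t r = uint64_t(a) + b;
  scc = (r >> 32) & 1;                                 // carry-out into SCC
  return uint32_t(r);
}

static uint32_t scalar_addc(uint32_t a, uint32_t b) { // models S_ADDC_U32
  uint64_t r = uint64_t(a) + b + (scc ? 1 : 0);        // consumes SCC
  scc = (r >> 32) & 1;
  return uint32_t(r);
}

int main() {
  uint64_t x = 0x00000001ffffffffULL, y = 0x0000000000000001ULL;
  // 64-bit add split into lo/hi halves, as the pseudos are lowered. The carry
  // needed by the hi half is exactly the SCC left by the lo-half add, so
  // recomputing it via S_CSELECT + S_CMP is dead work if SCC is untouched.
  uint32_t lo = scalar_add(uint32_t(x), uint32_t(y));
  uint32_t hi = scalar_addc(uint32_t(x >> 32), uint32_t(y >> 32));
  assert(((uint64_t(hi) << 32) | lo) == x + y);
  return 0;
}
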
@@ -16588,17 +16672,12 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
   // LHS = ADD i64 Y, 1          LHSlo = UADDO i32 Ylo, 1
   // setcc LHS eq 0          ->  LHSHi = UADDO_CARRY i32 Yhi, 0
 
-  // Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
-  // non-divergent operations. This can result in lo/hi 32-bit operations
-  // being done in SGPR and VGPR with additional operations being needed
-  // to move operands and/or generate the intermediate carry.
-  if (VT == MVT::i64 && N->isDivergent() &&
-      ((CC == ISD::SETULT &&
-        sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETUGT &&
-        sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
-        sd_match(LHS, m_Add(m_Value(), m_One()))))) {
+  if (VT == MVT::i64 && ((CC == ISD::SETULT &&
+                          sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETUGT &&
+                          sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
+                          sd_match(LHS, m_Add(m_Value(), m_One()))))) {
     EVT TargetType = MVT::i32;
     EVT CarryVT = MVT::i1;
     bool IsAdd = LHS.getOpcode() == ISD::ADD;
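
For the setcc combine touched in the last hunk, here is a standalone sketch (plain C++, not part of the patch; helper names invented) checking the identity the in-source comment relies on: a 64-bit (y + 1) == 0 test can be evaluated from a 32-bit UADDO on the low half plus a UADDO_CARRY on the high half.

// Hypothetical check of the 64-bit-to-32-bit split, for illustration only.
#include <cassert>
#include <cstdint>

static bool setcc_eq0_64(uint64_t y) { return (y + 1) == 0; }

static bool setcc_eq0_split(uint64_t y) {
  uint32_t ylo = uint32_t(y), yhi = uint32_t(y >> 32);
  uint64_t lo = uint64_t(ylo) + 1;          // UADDO i32 Ylo, 1
  uint32_t carry = uint32_t(lo >> 32);      // carry-out of the low half
  uint64_t hi = uint64_t(yhi) + 0 + carry;  // UADDO_CARRY i32 Yhi, 0
  // The 64-bit sum is zero iff both 32-bit halves are zero.
  return uint32_t(lo) == 0 && uint32_t(hi) == 0;
}

int main() {
  for (uint64_t y : {0ull, 1ull, 0xffffffffull, ~0ull, ~0ull - 1, 0x1ffffffffull})
    assert(setcc_eq0_64(y) == setcc_eq0_split(y));
  return 0;
}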