@@ -6081,9 +6081,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MachineOperand &Src0 = MI.getOperand(2);
     MachineOperand &Src1 = MI.getOperand(3);
     MachineOperand &Src2 = MI.getOperand(4);
-    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
-                       ? AMDGPU::S_ADDC_U32
-                       : AMDGPU::S_SUBB_U32;
+
+    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO);
+
     if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
       Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
@@ -6103,6 +6103,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
       Src2.setReg(RegOp2);
     }

+<<<<<<< HEAD
     if (ST.isWave64()) {
       if (ST.hasScalarCompareEq64()) {
         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
@@ -6140,6 +6141,89 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,

     unsigned SelOpc =
         ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+=======
+    const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+    assert(WaveSize == 64 || WaveSize == 32);
+
+    unsigned SelOpc =
+        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+    unsigned AddcSubbOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    unsigned AddSubOpc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+    // Lowering for:
+    //
+    //   S_UADDO_PSEUDO|S_ADD_CO_PSEUDO
+    //   <no SCC def code>
+    //   S_ADD_CO_PSEUDO
+    //
+    // produces:
+    //
+    //   S_ADD_I32|S_ADDC_U32              ; lowered from S_UADDO_PSEUDO
+    //   SREG = S_CSELECT_B32|64 [1,-1], 0 ; lowered from S_UADDO_PSEUDO
+    //   <no SCC def code>
+    //   S_CMP32|64 SREG, 0                ; lowered from S_ADD_CO_PSEUDO
+    //   S_ADDC_U32                        ; lowered from S_ADD_CO_PSEUDO
+    //
+    // At this point, before generating the S_CMP, check whether it is
+    // redundant. If so, do not recalculate it; subsequent optimizations will
+    // also delete the dead S_CSELECT*.
+
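+    // Src2 holds the carry-out that the earlier S_CSELECT materialized from
+    // SCC. If that S_CSELECT immediately follows the add/sub which set SCC,
+    // and nothing between there and MI clobbers SCC, the compare is redundant.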
+    bool RecalculateSCC{true};
+    MachineInstr *Def = MRI.getVRegDef(Src2.getReg());
+    if (Def && Def->getParent() == BB && Def->getOpcode() == SelOpc &&
+        Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0 &&
+        Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0) {
+
+      auto I1 = std::next(MachineBasicBlock::reverse_iterator(Def));
+      if (I1 != BB->rend() &&
+          (I1->getOpcode() == AddSubOpc || I1->getOpcode() == AddcSubbOpc)) {
+        RecalculateSCC = false;
+        // Ensure there are no intervening definitions of SCC.
+        for (auto I2 = std::next(MachineBasicBlock::reverse_iterator(MI));
+             I2 != I1; I2++) {
+          if (I2->definesRegister(AMDGPU::SCC, TRI)) {
+            RecalculateSCC = true;
+            break;
+          }
+        }
+      }
+    }
+
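+    // SCC could not be proven live here: rematerialize the carry by comparing
+    // the saved carry value in Src2 against zero.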
+    if (RecalculateSCC) {
+      if (WaveSize == 64) {
+        if (ST.hasScalarCompareEq64()) {
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+              .addReg(Src2.getReg())
+              .addImm(0);
+        } else {
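+          // No 64-bit scalar compare on this subtarget: OR the two 32-bit
+          // halves and compare the result against zero instead; the OR is
+          // nonzero exactly when the 64-bit value is nonzero.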
+          const TargetRegisterClass *SubRC =
+              TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+          MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+          MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+              MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+          Register Src2_32 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+              .add(Src2Sub0)
+              .add(Src2Sub1);
+
+          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+              .addReg(Src2_32, RegState::Kill)
+              .addImm(0);
+        }
+      } else {
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+            .addReg(Src2.getReg())
+            .addImm(0);
+      }
+    }
+
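+    // Emit the carry-consuming add/sub; it reads the carry from SCC, whether
+    // SCC survives from the original add/sub or was recomputed above.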
+    BuildMI(*BB, MII, DL, TII->get(AddcSubbOpc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+>>>>>>> 0cb43743ea30 (Do not generate S_CMP if add/sub carryout is available)

     BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
         .addImm(-1)
@@ -16588,17 +16672,12 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
   // LHS = ADD i64 Y, 1      LHSlo = UADDO i32 Ylo, 1
   // setcc LHS eq 0    ->    LHSHi = UADDO_CARRY i32 Yhi, 0

-  // Don't split a 64-bit add/sub into two 32-bit add/sub instructions for
-  // non-divergent operations. This can result in lo/hi 32-bit operations
-  // being done in SGPR and VGPR with additional operations being needed
-  // to move operands and/or generate the intermediate carry.
-  if (VT == MVT::i64 && N->isDivergent() &&
-      ((CC == ISD::SETULT &&
-        sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETUGT &&
-        sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
-       (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
-        sd_match(LHS, m_Add(m_Value(), m_One()))))) {
+  if (VT == MVT::i64 && ((CC == ISD::SETULT &&
+                          sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETUGT &&
+                          sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
+                         (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
+                          sd_match(LHS, m_Add(m_Value(), m_One()))))) {
     EVT TargetType = MVT::i32;
     EVT CarryVT = MVT::i1;
     bool IsAdd = LHS.getOpcode() == ISD::ADD;