@@ -6110,81 +6110,34 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     unsigned SelectOpc =
         (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
     unsigned AddcSubbOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-    unsigned AddSubOpc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
-    // Lowering for:
-    //
-    // S_UADDO_PSEUDO|S_ADD_CO_PSEUDO
-    // <no SCC def code>
-    // S_ADD_CO_PSEUDO
-    //
-    // produces:
-    //
-    // S_ADD_I32|S_ADDC_U32 ; lowered from S_UADDO_PSEUDO
-    // SREG = S_CSELECT_B32|64 [1,-1], 0 ; lowered from S_UADDO_PSEUDO
-    // <no SCC def code>
-    // S_CMP32|64 SREG, 0 ; lowered from S_ADD_CO_PSEUDO
-    // S_ADDC_U32 ; lowered from S_ADD_CO_PSEUDO
-    //
-    // At this point before generating the S_CMP check if it is redundant. If
-    // so do not recalculate it. Subsequent optimizations will also delete the
-    // dead S_CSELECT*.
-
-    bool RecalculateSCC{true};
-    MachineInstr *SelectDef = MRI.getVRegDef(Src2.getReg());
-    if (SelectDef && SelectDef->getParent() == BB &&
-        SelectDef->getOpcode() == SelectOpc &&
-        SelectDef->getOperand(1).isImm() &&
-        SelectDef->getOperand(1).getImm() != 0 &&
-        SelectDef->getOperand(2).isImm() &&
-        SelectDef->getOperand(2).getImm() == 0) {
-      auto I1 = std::next(MachineBasicBlock::reverse_iterator(SelectDef));
-      if (I1 != BB->rend() &&
-          (I1->getOpcode() == AddSubOpc || I1->getOpcode() == AddcSubbOpc)) {
-        // Ensure there are no intervening definitions of SCC between ADDs/SUBs
-        const unsigned SearchLimit = 6;
-        unsigned Count = 0;
-        for (auto I2 = std::next(MachineBasicBlock::reverse_iterator(MI));
-             Count < SearchLimit; I2++, Count++) {
-          if (I2 == I1) {
-            RecalculateSCC = false;
-            break;
-          }
-          if (I2->definesRegister(AMDGPU::SCC, TRI))
-            break;
-        }
-      }
-    }

-    if (RecalculateSCC) {
-      if (WaveSize == 64) {
-        if (ST.hasScalarCompareEq64()) {
-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
-              .addReg(Src2.getReg())
-              .addImm(0);
-        } else {
-          const TargetRegisterClass *SubRC =
-              TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
-          MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
-              MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
-          MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
-              MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
-          Register Src2_32 =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
-              .add(Src2Sub0)
-              .add(Src2Sub1);
+    if (WaveSize == 64) {
+      if (ST.hasScalarCompareEq64()) {
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+            .addReg(Src2.getReg())
+            .addImm(0);
+      } else {
+        const TargetRegisterClass *SubRC =
+            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+            .add(Src2Sub0)
+            .add(Src2Sub1);

-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
-              .addReg(Src2_32, RegState::Kill)
-              .addImm(0);
-        }
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+            .addReg(Src2_32, RegState::Kill)
+            .addImm(0);
+      }
     } else {
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
           .addReg(Src2.getReg())
           .addImm(0);
     }
-    }

     BuildMI(*BB, MII, DL, TII->get(AddcSubbOpc), Dest.getReg())
         .add(Src0)
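
The comment removed in this hunk describes the sequence that an S_UADDO_PSEUDO followed by an S_ADD_CO_PSEUDO lowers to: an add that produces a carry, an S_CSELECT that saves the carry into an SGPR, an S_CMP that reloads it into SCC, and an S_ADDC_U32 that consumes it. With this change the S_CMP is emitted unconditionally again; the deleted code skipped it only when the carry-producing add/sub was close enough that SCC was provably still valid. Below is a minimal host-side C++ sketch of that carry chain, illustrative only: the variable names and values are hypothetical, not taken from the patch.

// Minimal host-side sketch (illustrative only, not part of this patch) of the
// carry chain described by the deleted comment: the low-half add produces a
// carry, S_UADDO_PSEUDO materializes it into an SGPR (S_CSELECT), and
// S_ADD_CO_PSEUDO re-materializes it into SCC (S_CMP_LG_*) before the
// high-half S_ADDC_U32 consumes it.
#include <cstdint>
#include <cstdio>

int main() {
  // 64-bit operands split into 32-bit halves, as the scalar lowering sees them.
  uint32_t ALo = 0xFFFFFFFFu, AHi = 0x00000001u;
  uint32_t BLo = 0x00000001u, BHi = 0x00000002u;

  uint32_t Lo = ALo + BLo;           // low-half add (S_ADD_I32 in the pseudo lowering)
  uint32_t Carry = Lo < ALo ? 1 : 0; // carry-out saved in a register (S_CSELECT_B32/B64)
  // ... unrelated code may run here and clobber SCC ...
  bool SCC = (Carry != 0);           // recompute SCC from the register (S_CMP_LG_U32/U64)
  uint32_t Hi = AHi + BHi + SCC;     // high-half add consuming the carry (S_ADDC_U32)

  printf("0x%08x%08x\n", Hi, Lo);    // prints 0x0000000400000000
  return 0;
}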