@@ -6110,81 +6110,34 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     unsigned SelectOpc =
         (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
     unsigned AddcSubbOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-    unsigned AddSubOpc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
-    // Lowering for:
-    //
-    // S_UADDO_PSEUDO|S_ADD_CO_PSEUDO
-    // <no SCC def code>
-    // S_ADD_CO_PSEUDO
-    //
-    // produces:
-    //
-    // S_ADD_I32|S_ADDC_U32 ; lowered from S_UADDO_PSEUDO
-    // SREG = S_CSELECT_B32|64 [1,-1], 0 ; lowered from S_UADDO_PSEUDO
-    // <no SCC def code>
-    // S_CMP32|64 SREG, 0 ; lowered from S_ADD_CO_PSEUDO
-    // S_ADDC_U32 ; lowered from S_ADD_CO_PSEUDO
-    //
-    // At this point before generating the S_CMP check if it is redundant. If
-    // so do not recalculate it. Subsequent optimizations will also delete the
-    // dead S_CSELECT*.
-
-    bool RecalculateSCC{true};
-    MachineInstr *SelectDef = MRI.getVRegDef(Src2.getReg());
-    if (SelectDef && SelectDef->getParent() == BB &&
-        SelectDef->getOpcode() == SelectOpc &&
-        SelectDef->getOperand(1).isImm() &&
-        SelectDef->getOperand(1).getImm() != 0 &&
-        SelectDef->getOperand(2).isImm() &&
-        SelectDef->getOperand(2).getImm() == 0) {
-      auto I1 = std::next(MachineBasicBlock::reverse_iterator(SelectDef));
-      if (I1 != BB->rend() &&
-          (I1->getOpcode() == AddSubOpc || I1->getOpcode() == AddcSubbOpc)) {
-        // Ensure there are no intervening definitions of SCC between ADDs/SUBs
-        const unsigned SearchLimit = 6;
-        unsigned Count = 0;
-        for (auto I2 = std::next(MachineBasicBlock::reverse_iterator(MI));
-             Count < SearchLimit; I2++, Count++) {
-          if (I2 == I1) {
-            RecalculateSCC = false;
-            break;
-          }
-          if (I2->definesRegister(AMDGPU::SCC, TRI))
-            break;
-        }
-      }
-    }
 
-    if (RecalculateSCC) {
-      if (WaveSize == 64) {
-        if (ST.hasScalarCompareEq64()) {
-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
-              .addReg(Src2.getReg())
-              .addImm(0);
-        } else {
-          const TargetRegisterClass *SubRC =
-              TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
-          MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
-              MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
-          MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
-              MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
-          Register Src2_32 =
-              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
-              .add(Src2Sub0)
-              .add(Src2Sub1);
+    if (WaveSize == 64) {
+      if (ST.hasScalarCompareEq64()) {
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+            .addReg(Src2.getReg())
+            .addImm(0);
+      } else {
+        const TargetRegisterClass *SubRC =
+            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
+        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+            .add(Src2Sub0)
+            .add(Src2Sub1);
 
-          BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
-              .addReg(Src2_32, RegState::Kill)
-              .addImm(0);
-        }
+        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+            .addReg(Src2_32, RegState::Kill)
+            .addImm(0);
+      }
     } else {
       BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
           .addReg(Src2.getReg())
           .addImm(0);
     }
-    }
 
     BuildMI(*BB, MII, DL, TII->get(AddcSubbOpc), Dest.getReg())
         .add(Src0)