@@ -51,10 +51,10 @@ class SIShrinkInstructions {
                     unsigned SubReg) const;
   bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                        unsigned SubReg) const;
-  bool trySwitchOperands(MachineInstr &MI, Register *OldVCC,
-                         Register *NewVCC) const;
-  bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
-                            const SIInstrInfo &TII) const;
+  Register trySwapCndOperands(MachineInstr &MI) const;
+  bool
+  shouldSwapCndOperands(MachineInstr &MI, const SIInstrInfo &TII,
+                        SmallVector<MachineOperand *, 4> &UsesToProcess) const;
   unsigned getInverseCompareOpcode(MachineInstr &MI) const;
   TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                    unsigned I) const;
@@ -851,92 +851,137 @@ unsigned SIShrinkInstructions::getInverseCompareOpcode(MachineInstr &MI) const {
     return AMDGPU::V_CMP_LE_U32_e64;
   case AMDGPU::V_CMP_LT_U32_e64:
     return AMDGPU::V_CMP_GE_U32_e64;
-  // float 32
+  // unsigned 64
+  case AMDGPU::V_CMP_EQ_U64_e64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  case AMDGPU::V_CMP_NE_U64_e64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::V_CMP_GE_U64_e64:
+    return AMDGPU::V_CMP_LT_U64_e64;
+  case AMDGPU::V_CMP_LE_U64_e64:
+    return AMDGPU::V_CMP_GT_U64_e64;
+  case AMDGPU::V_CMP_GT_U64_e64:
+    return AMDGPU::V_CMP_LE_U64_e64;
+  case AMDGPU::V_CMP_LT_U64_e64:
+    return AMDGPU::V_CMP_GE_U64_e64;
+  // float 32
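+  // The logical inverse of an ordered float compare is the matching
+  // unordered "N" compare, so NaN inputs still end up on the inverted side.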
   case AMDGPU::V_CMP_EQ_F32_e64:
     return AMDGPU::V_CMP_NEQ_F32_e64;
   case AMDGPU::V_CMP_NEQ_F32_e64:
     return AMDGPU::V_CMP_EQ_F32_e64;
   case AMDGPU::V_CMP_GE_F32_e64:
-    return AMDGPU::V_CMP_LT_F32_e64;
+    return AMDGPU::V_CMP_NGE_F32_e64;
   case AMDGPU::V_CMP_LE_F32_e64:
-    return AMDGPU::V_CMP_GT_F32_e64;
+    return AMDGPU::V_CMP_NLE_F32_e64;
   case AMDGPU::V_CMP_GT_F32_e64:
-    return AMDGPU::V_CMP_LE_F32_e64;
+    return AMDGPU::V_CMP_NGT_F32_e64;
   case AMDGPU::V_CMP_LT_F32_e64:
-    return AMDGPU::V_CMP_GE_F32_e64;
+    return AMDGPU::V_CMP_NLT_F32_e64;
+  // float 64
+  case AMDGPU::V_CMP_EQ_F64_e64:
+    return AMDGPU::V_CMP_NEQ_F64_e64;
+  case AMDGPU::V_CMP_NEQ_F64_e64:
+    return AMDGPU::V_CMP_EQ_F64_e64;
+  case AMDGPU::V_CMP_GE_F64_e64:
+    return AMDGPU::V_CMP_NGE_F64_e64;
+  case AMDGPU::V_CMP_LE_F64_e64:
+    return AMDGPU::V_CMP_NLE_F64_e64;
+  case AMDGPU::V_CMP_GT_F64_e64:
+    return AMDGPU::V_CMP_NGT_F64_e64;
+  case AMDGPU::V_CMP_LT_F64_e64:
+    return AMDGPU::V_CMP_NLT_F64_e64;
   default:
     return 0;
   }
 }
 
-bool SIShrinkInstructions::shouldSwitchOperands(MachineRegisterInfo &MRI,
-                                                MachineInstr &MI,
-                                                const SIInstrInfo &TII) const {
-  auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
-  unsigned Count = 0;
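+// Decide whether inverting this compare and swapping the select operands is
+// worthwhile: every non-debug use of the result must be a V_CNDMASK_B32_e64,
+// and at least one of those selects must profit from the swap. The qualifying
+// uses are collected in UsesToProcess.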
+bool SIShrinkInstructions::shouldSwapCndOperands(
+    MachineInstr &MI, const SIInstrInfo &TII,
+    SmallVector<MachineOperand *, 4> &UsesToProcess) const {
+  auto AllUses = MRI->use_nodbg_operands(MI.getOperand(0).getReg());
+  bool ShouldSwap = false;
 
-  for (auto &Use : allUses) {
-    if (Use.getParent()->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+  for (auto &Use : AllUses) {
+    MachineInstr *UseInst = Use.getParent();
+    if (UseInst->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
       return false;
-    MachineOperand *Src0 =
-        TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src0);
-    MachineOperand *Src1 =
-        TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src1);
+    MachineOperand *Src0 = TII.getNamedOperand(*UseInst, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = TII.getNamedOperand(*UseInst, AMDGPU::OpName::src1);
 
     auto Src0Imm = Src0->isImm();
     auto Src1Imm = Src1->isImm();
 
     if (!Src1Imm && Src0Imm)
       return false;
-    if (Src1Imm && !Src0Imm)
-      Count++;
+
+    UsesToProcess.push_back(&Use);
+
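+    // Operand 1 is src0_modifiers. Swapping is likely profitable when the
+    // immediate sits in src1 (the e32 form needs a VGPR there) and src0
+    // carries no source modifiers.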
+    if (Src1Imm && !Src0Imm && !UseInst->getOperand(1).getImm())
+      ShouldSwap = true;
   }
-  return (Count >= 1);
+  return ShouldSwap;
 }
 
-// OldVCC and NewVCC are used to remember VCC after inverting comparison
-bool SIShrinkInstructions::trySwitchOperands(MachineInstr &MI, Register *OldVCC,
-                                             Register *NewVCC) const {
-  const DebugLoc &DL = MI.getDebugLoc();
-  auto Reg = MI.getOperand(5).getReg();
-  if (!Reg.isVirtual())
-    return false;
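+// Exchange src0 and src1 (and their modifiers) of a V_CNDMASK_B32_e64.
+// Operand layout: 0 = dst, 1 = src0_modifiers, 2 = src0, 3 = src1_modifiers,
+// 4 = src1, 5 = condition.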
+void swapCndOperands(MachineInstr &MI) {
+  MachineOperand Op2 = MI.getOperand(2);
+  MachineOperand Op4 = MI.getOperand(4);
+
+  if (Op2.isReg()) {
+    MI.getOperand(4).ChangeToRegister(
+        Op2.getReg(), Op2.isDef(), Op2.isImplicit(), Op2.isKill(),
+        Op2.isDead(), Op2.isUndef(), Op2.isDebug());
+    if (Op2.getSubReg() != AMDGPU::NoSubRegister)
+      MI.getOperand(4).setSubReg(Op2.getSubReg());
+  } else if (Op2.isImm()) {
+    MI.getOperand(4).ChangeToImmediate(Op2.getImm());
+  }
 
-  if (*OldVCC != Reg) {
-    MachineInstr *DefMI = MRI->getVRegDef(Reg);
-    if (DefMI) {
-      unsigned Opcode = getInverseCompareOpcode(*DefMI);
-      if (Opcode &&
-          SIShrinkInstructions::shouldSwitchOperands(*MRI, MI, *TII)) {
-        auto cmpDL = DefMI->getDebugLoc();
-        *NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
-        *OldVCC = Reg;
-        MachineInstrBuilder InverseCompare = BuildMI(
-            *DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
-        InverseCompare->setFlags(DefMI->getFlags());
-
-        unsigned OpNum = DefMI->getNumExplicitOperands();
-        for (unsigned i = 1; i < OpNum; i++) {
-          MachineOperand Op = DefMI->getOperand(i);
-          InverseCompare.add(Op);
-          if (Op.isReg() && Op.isKill())
-            InverseCompare->getOperand(i).setIsKill(false);
-        }
-      }
-    }
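+  // shouldSwapCndOperands rejects selects whose src0 is an immediate while
+  // src1 is a register, so when Op4 is a register here, operand 2 is one too
+  // and plain setReg() is safe.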
+  if (Op4.isReg()) {
+    MI.getOperand(2).setReg(Op4.getReg());
+    if (Op4.getSubReg() != AMDGPU::NoSubRegister)
+      MI.getOperand(2).setSubReg(Op4.getSubReg());
+  } else if (Op4.isImm()) {
+    MI.getOperand(2).ChangeToImmediate(Op4.getImm());
   }
-  if (*OldVCC == Reg) {
-    BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
-            MI.getOperand(0).getReg())
-        .add(MI.getOperand(3))
-        .add(MI.getOperand(4))
-        .add(MI.getOperand(1))
-        .add(MI.getOperand(2))
-        .addReg(*NewVCC);
-    MI.eraseFromParent();
-    return true;
+
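+  // Swap the src0_modifiers/src1_modifiers immediates to follow their
+  // operands.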
+  MachineOperand Op1 = MI.getOperand(1);
+  MachineOperand Op3 = MI.getOperand(3);
+  MI.getOperand(1).setImm(Op3.getImm());
+  MI.getOperand(3).setImm(Op1.getImm());
+}
+
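+// Invert a compare whose only users are V_CNDMASK_B32_e64 selects, swap each
+// select's data operands to match, and rewire all uses to the inverted
+// result. Returns the new condition register, or AMDGPU::NoRegister if the
+// transform does not apply.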
+Register SIShrinkInstructions::trySwapCndOperands(MachineInstr &MI) const {
+  Register Reg = MI.getOperand(0).getReg();
+
+  unsigned Opcode = getInverseCompareOpcode(MI);
+  SmallVector<MachineOperand *, 4> UsesToProcess;
+  if (!Opcode ||
+      !SIShrinkInstructions::shouldSwapCndOperands(MI, *TII, UsesToProcess))
+    return AMDGPU::NoRegister;
+
+  auto DL = MI.getDebugLoc();
+  Register NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+
+  MachineInstrBuilder InverseCompare =
+      BuildMI(*MI.getParent(), MI, DL, TII->get(Opcode), NewVCC);
+  InverseCompare->setFlags(MI.getFlags());
+
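+  // Copy the compare's explicit source operands (operand 0 is the def) onto
+  // the inverted opcode; drop kill flags, which may not hold on the copy.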
+  unsigned OpNum = MI.getNumExplicitOperands();
+  for (unsigned i = 1; i < OpNum; i++) {
+    MachineOperand Op = MI.getOperand(i);
+    InverseCompare.add(Op);
+    if (Op.isReg() && Op.isKill())
+      InverseCompare->getOperand(i).setIsKill(false);
   }
-  return false;
+
+  for (auto &Use : UsesToProcess) {
+    MachineInstr *Inst = Use->getParent();
+    swapCndOperands(*Inst);
+  }
+
+  MRI->replaceRegWith(Reg, NewVCC);
+  MI.eraseFromParent();
+  return NewVCC;
 }
 
 bool SIShrinkInstructions::run(MachineFunction &MF) {
@@ -950,9 +995,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
   unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
 
   std::vector<unsigned> I1Defs;
-  Register OldVCC = AMDGPU::NoRegister;
-  Register NewVCC = AMDGPU::NoRegister;
-
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -1083,10 +1125,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
         continue;
       }
 
-      if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-          trySwitchOperands(MI, &OldVCC, &NewVCC))
-        MRI->setRegAllocationHint(NewVCC, 0, VCCReg);
-
       // If there is no chance we will shrink it and use VCC as sdst to get
       // a 32 bit form try to replace dead sdst with NULL.
       if (TII->isVOP3(MI.getOpcode())) {
@@ -1124,6 +1162,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
           // provide a hint to the register allocator to use VCC and then we
           // will run this pass again after RA and shrink it if it outputs to
           // VCC.
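+          // First see whether inverting this compare and swapping its
+          // V_CNDMASK users' operands applies; if so, hint the new condition
+          // register instead of the erased one.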
+          Register NewVCC = trySwapCndOperands(MI);
+          DstReg = NewVCC == AMDGPU::NoRegister ? DstReg : NewVCC;
+
           MRI->setRegAllocationHint(DstReg, 0, VCCReg);
           continue;
         }