@@ -51,10 +51,10 @@ class SIShrinkInstructions {
                     unsigned SubReg) const;
   bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                        unsigned SubReg) const;
-  bool trySwitchOperands(MachineInstr &MI, Register *OldVCC,
-                         Register *NewVCC) const;
-  bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
-                            const SIInstrInfo &TII) const;
+  Register trySwapCndOperands(MachineInstr &MI) const;
+  bool
+  shouldSwapCndOperands(MachineInstr &MI, const SIInstrInfo &TII,
+                        SmallVector<MachineOperand *, 4> &UsesToProcess) const;
   unsigned getInverseCompareOpcode(MachineInstr &MI) const;
   TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                    unsigned I) const;
@@ -851,92 +851,137 @@ unsigned SIShrinkInstructions::getInverseCompareOpcode(MachineInstr &MI) const {
     return AMDGPU::V_CMP_LE_U32_e64;
   case AMDGPU::V_CMP_LT_U32_e64:
     return AMDGPU::V_CMP_GE_U32_e64;
-  // float 32
+  // unsigned 64
+  case AMDGPU::V_CMP_EQ_U64_e64:
+    return AMDGPU::V_CMP_NE_U64_e64;
+  case AMDGPU::V_CMP_NE_U64_e64:
+    return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::V_CMP_GE_U64_e64:
+    return AMDGPU::V_CMP_LT_U64_e64;
+  case AMDGPU::V_CMP_LE_U64_e64:
+    return AMDGPU::V_CMP_GT_U64_e64;
+  case AMDGPU::V_CMP_GT_U64_e64:
+    return AMDGPU::V_CMP_LE_U64_e64;
+  case AMDGPU::V_CMP_LT_U64_e64:
+    return AMDGPU::V_CMP_GE_U64_e64;
+  // float 32
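+  // The logical inverse of an ordered float compare is the matching
+  // unordered "N" compare, so NaN inputs still end up on the inverted side.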
   case AMDGPU::V_CMP_EQ_F32_e64:
     return AMDGPU::V_CMP_NEQ_F32_e64;
   case AMDGPU::V_CMP_NEQ_F32_e64:
     return AMDGPU::V_CMP_EQ_F32_e64;
   case AMDGPU::V_CMP_GE_F32_e64:
-    return AMDGPU::V_CMP_LT_F32_e64;
+    return AMDGPU::V_CMP_NGE_F32_e64;
   case AMDGPU::V_CMP_LE_F32_e64:
-    return AMDGPU::V_CMP_GT_F32_e64;
+    return AMDGPU::V_CMP_NLE_F32_e64;
   case AMDGPU::V_CMP_GT_F32_e64:
-    return AMDGPU::V_CMP_LE_F32_e64;
+    return AMDGPU::V_CMP_NGT_F32_e64;
   case AMDGPU::V_CMP_LT_F32_e64:
-    return AMDGPU::V_CMP_GE_F32_e64;
+    return AMDGPU::V_CMP_NLT_F32_e64;
+  // float 64
+  case AMDGPU::V_CMP_EQ_F64_e64:
+    return AMDGPU::V_CMP_NEQ_F64_e64;
+  case AMDGPU::V_CMP_NEQ_F64_e64:
+    return AMDGPU::V_CMP_EQ_F64_e64;
+  case AMDGPU::V_CMP_GE_F64_e64:
+    return AMDGPU::V_CMP_NGE_F64_e64;
+  case AMDGPU::V_CMP_LE_F64_e64:
+    return AMDGPU::V_CMP_NLE_F64_e64;
+  case AMDGPU::V_CMP_GT_F64_e64:
+    return AMDGPU::V_CMP_NGT_F64_e64;
+  case AMDGPU::V_CMP_LT_F64_e64:
+    return AMDGPU::V_CMP_NLT_F64_e64;
   default:
     return 0;
   }
 }
 
-bool SIShrinkInstructions::shouldSwitchOperands(MachineRegisterInfo &MRI,
-                                                MachineInstr &MI,
-                                                const SIInstrInfo &TII) const {
-  auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
-  unsigned Count = 0;
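+// Decide whether inverting this compare and swapping the select operands is
+// worthwhile: every non-debug use of the result must be a V_CNDMASK_B32_e64,
+// and at least one of those selects must profit from the swap. The qualifying
+// uses are collected in UsesToProcess.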
+bool SIShrinkInstructions::shouldSwapCndOperands(
+    MachineInstr &MI, const SIInstrInfo &TII,
+    SmallVector<MachineOperand *, 4> &UsesToProcess) const {
+  auto AllUses = MRI->use_nodbg_operands(MI.getOperand(0).getReg());
+  bool ShouldSwap = false;
 
-  for (auto &Use : allUses) {
-    if (Use.getParent()->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+  for (auto &Use : AllUses) {
+    MachineInstr *UseInst = Use.getParent();
+    if (UseInst->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
       return false;
-    MachineOperand *Src0 =
-        TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src0);
-    MachineOperand *Src1 =
-        TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src1);
+    MachineOperand *Src0 = TII.getNamedOperand(*UseInst, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = TII.getNamedOperand(*UseInst, AMDGPU::OpName::src1);
 
     auto Src0Imm = Src0->isImm();
     auto Src1Imm = Src1->isImm();
 
     if (!Src1Imm && Src0Imm)
       return false;
-    if (Src1Imm && !Src0Imm)
-      Count++;
+
+    UsesToProcess.push_back(&Use);
+
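+    // Operand 1 is src0_modifiers. Swapping is likely profitable when the
+    // immediate sits in src1 (the e32 form needs a VGPR there) and src0
+    // carries no source modifiers.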
+    if (Src1Imm && !Src0Imm && !UseInst->getOperand(1).getImm())
+      ShouldSwap = true;
   }
-  return (Count >= 1);
+  return ShouldSwap;
 }
 
-// OldVCC and NewVCC are used to remember VCC after inverting comparison
-bool SIShrinkInstructions::trySwitchOperands(MachineInstr &MI, Register *OldVCC,
-                                             Register *NewVCC) const {
-  const DebugLoc &DL = MI.getDebugLoc();
-  auto Reg = MI.getOperand(5).getReg();
-  if (!Reg.isVirtual())
-    return false;
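+// Exchange src0 and src1 (and their modifiers) of a V_CNDMASK_B32_e64.
+// Operand layout: 0 = dst, 1 = src0_modifiers, 2 = src0, 3 = src1_modifiers,
+// 4 = src1, 5 = condition.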
+void swapCndOperands(MachineInstr &MI) {
+  MachineOperand Op2 = MI.getOperand(2);
+  MachineOperand Op4 = MI.getOperand(4);
+
+  if (Op2.isReg()) {
+    MI.getOperand(4).ChangeToRegister(
+        Op2.getReg(), Op2.isDef(), Op2.isImplicit(), Op2.isKill(),
+        Op2.isDead(), Op2.isUndef(), Op2.isDebug());
+    if (Op2.getSubReg() != AMDGPU::NoSubRegister)
+      MI.getOperand(4).setSubReg(Op2.getSubReg());
+  } else if (Op2.isImm()) {
+    MI.getOperand(4).ChangeToImmediate(Op2.getImm());
+  }
 
-  if (*OldVCC != Reg) {
-    MachineInstr *DefMI = MRI->getVRegDef(Reg);
-    if (DefMI) {
-      unsigned Opcode = getInverseCompareOpcode(*DefMI);
-      if (Opcode &&
-          SIShrinkInstructions::shouldSwitchOperands(*MRI, MI, *TII)) {
-        auto cmpDL = DefMI->getDebugLoc();
-        *NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
-        *OldVCC = Reg;
-        MachineInstrBuilder InverseCompare = BuildMI(
-            *DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
-        InverseCompare->setFlags(DefMI->getFlags());
-
-        unsigned OpNum = DefMI->getNumExplicitOperands();
-        for (unsigned i = 1; i < OpNum; i++) {
-          MachineOperand Op = DefMI->getOperand(i);
-          InverseCompare.add(Op);
-          if (Op.isReg() && Op.isKill())
-            InverseCompare->getOperand(i).setIsKill(false);
-        }
-      }
-    }
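+  // shouldSwapCndOperands rejects selects whose src0 is an immediate while
+  // src1 is a register, so when Op4 is a register here, operand 2 is one too
+  // and plain setReg() is safe.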
+  if (Op4.isReg()) {
+    MI.getOperand(2).setReg(Op4.getReg());
+    if (Op4.getSubReg() != AMDGPU::NoSubRegister)
+      MI.getOperand(2).setSubReg(Op4.getSubReg());
+  } else if (Op4.isImm()) {
+    MI.getOperand(2).ChangeToImmediate(Op4.getImm());
   }
-  if (*OldVCC == Reg) {
-    BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
-            MI.getOperand(0).getReg())
-        .add(MI.getOperand(3))
-        .add(MI.getOperand(4))
-        .add(MI.getOperand(1))
-        .add(MI.getOperand(2))
-        .addReg(*NewVCC);
-    MI.eraseFromParent();
-    return true;
+
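+  // Swap the src0_modifiers/src1_modifiers immediates to follow their
+  // operands.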
+  MachineOperand Op1 = MI.getOperand(1);
+  MachineOperand Op3 = MI.getOperand(3);
+  MI.getOperand(1).setImm(Op3.getImm());
+  MI.getOperand(3).setImm(Op1.getImm());
+}
+
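+// Invert a compare whose only users are V_CNDMASK_B32_e64 selects, swap each
+// select's data operands to match, and rewire all uses to the inverted
+// result. Returns the new condition register, or AMDGPU::NoRegister if the
+// transform does not apply.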
+Register SIShrinkInstructions::trySwapCndOperands(MachineInstr &MI) const {
+  Register Reg = MI.getOperand(0).getReg();
+
+  unsigned Opcode = getInverseCompareOpcode(MI);
+  SmallVector<MachineOperand *, 4> UsesToProcess;
+  if (!Opcode ||
+      !SIShrinkInstructions::shouldSwapCndOperands(MI, *TII, UsesToProcess))
+    return AMDGPU::NoRegister;
+
+  auto DL = MI.getDebugLoc();
+  Register NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+
+  MachineInstrBuilder InverseCompare =
+      BuildMI(*MI.getParent(), MI, DL, TII->get(Opcode), NewVCC);
+  InverseCompare->setFlags(MI.getFlags());
+
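+  // Copy the compare's explicit source operands (operand 0 is the def) onto
+  // the inverted opcode; drop kill flags, which may not hold on the copy.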
+  unsigned OpNum = MI.getNumExplicitOperands();
+  for (unsigned i = 1; i < OpNum; i++) {
+    MachineOperand Op = MI.getOperand(i);
+    InverseCompare.add(Op);
+    if (Op.isReg() && Op.isKill())
+      InverseCompare->getOperand(i).setIsKill(false);
   }
-  return false;
+
+  for (auto &Use : UsesToProcess) {
+    MachineInstr *Inst = Use->getParent();
+    swapCndOperands(*Inst);
+  }
+
+  MRI->replaceRegWith(Reg, NewVCC);
+  MI.eraseFromParent();
+  return NewVCC;
 }
 
 bool SIShrinkInstructions::run(MachineFunction &MF) {
@@ -950,9 +995,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
   unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
 
   std::vector<unsigned> I1Defs;
-  Register OldVCC = AMDGPU::NoRegister;
-  Register NewVCC = AMDGPU::NoRegister;
-
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -1083,10 +1125,6 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
         continue;
       }
 
-      if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-          trySwitchOperands(MI, &OldVCC, &NewVCC))
-        MRI->setRegAllocationHint(NewVCC, 0, VCCReg);
-
       // If there is no chance we will shrink it and use VCC as sdst to get
       // a 32 bit form try to replace dead sdst with NULL.
       if (TII->isVOP3(MI.getOpcode())) {
@@ -1124,6 +1162,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
           // provide a hint to the register allocator to use VCC and then we
           // will run this pass again after RA and shrink it if it outputs to
           // VCC.
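+          // First see whether inverting this compare and swapping its
+          // V_CNDMASK users' operands applies; if so, hint the new condition
+          // register instead of the erased one.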
+          Register NewVCC = trySwapCndOperands(MI);
+          DstReg = NewVCC == AMDGPU::NoRegister ? DstReg : NewVCC;
+
           MRI->setRegAllocationHint(DstReg, 0, VCCReg);
           continue;
         }