Commit 8aa0f83

perlfumemfrob authored and committed
[AMDGPU] Move kill lowering to WQM pass and add live mask tracking
Move implementation of kill intrinsics to WQM pass. Add live lane tracking by updating a stored exec mask when lanes are killed. Use live lane tracking to enable early termination of shader at any point in control flow.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D94746
1 parent 8fe5774 commit 8aa0f83

17 files changed: +973 -867 lines changed
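
The mechanics behind this summary can be pictured with a short sketch. This is illustrative only, not code from the commit: the helper name lowerKillSketch, the LiveMaskReg register, and the insertion point are assumptions, and the real lowering in the WQM pass also has to handle mode transitions this sketch omits.

// Sketch: a kill permanently clears the killed lanes from a stored live
// mask, then rederives exec from it. Keeping exec in sync with the live
// mask is what makes an "exec == 0 => terminate" test valid at any point
// in control flow. LiveMaskReg is an assumed SGPR (pair) initialized to
// the entry exec mask.
static void lowerKillSketch(MachineBasicBlock &MBB, MachineInstr &MI,
                            const SIInstrInfo *TII, Register LiveMaskReg,
                            Register ExecReg, bool IsWave32) {
  const DebugLoc DL = MI.getDebugLoc();
  unsigned AndN2Opc = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  // LiveMask &= ~KillCond: record the killed lanes for the rest of the shader.
  BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
      .addReg(LiveMaskReg)
      .add(MI.getOperand(0)); // lane mask selected by the kill condition

  // exec &= LiveMask: stop executing the killed lanes immediately.
  BuildMI(MBB, MI, DL, TII->get(AndOpc), ExecReg)
      .addReg(ExecReg)
      .addReg(LiveMaskReg);
}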

llvm/lib/Target/AMDGPU/SIInsertSkips.cpp

Lines changed: 7 additions & 228 deletions
@@ -44,19 +44,18 @@ class SIInsertSkips : public MachineFunctionPass {
   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;

-  bool dominatesAllReachable(MachineBasicBlock &MBB);
   void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
-  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                  DebugLoc DL);

-  bool kill(MachineInstr &MI);
   void earlyTerm(MachineInstr &MI);

   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

 public:
   static char ID;

+  unsigned MovOpc;
+  Register ExecReg;
+
   SIInsertSkips() : MachineFunctionPass(ID) {}

   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -138,15 +137,6 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
   return false;
 }

-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
-  for (MachineBasicBlock *Other : depth_first(&MBB)) {
-    if (!MDT->dominates(&MBB, Other))
-      return false;
-  }
-  return true;
-}
-
 static void generateEndPgm(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            const SIInstrInfo *TII, bool IsPS) {
@@ -181,11 +171,8 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
   }

   if (ClearExec && !EarlyExitClearsExec) {
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
+    BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
     EarlyExitClearsExec = true;
   }
 }
@@ -205,175 +192,6 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
   MDT->getBase().applyUpdates(DTUpdates);
 }

-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I, DebugLoc DL) {
-  MachineFunction *MF = MBB.getParent();
-  (void)MF;
-  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
-  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
-  // basic block that has no further successors (e.g., there was an
-  // `unreachable` there in IR). This can happen with original source of the
-  // form:
-  //
-  //   if (uniform_condition) {
-  //     write_to_memory();
-  //     discard;
-  //   }
-  //
-  // In this case, we write the "null_export; s_endpgm" skip code in the
-  // already-existing basic block.
-  auto NextBBI = std::next(MBB.getIterator());
-  bool NoSuccessor =
-      I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
-  if (NoSuccessor) {
-    generateEndPgm(MBB, I, DL, TII, true);
-  } else {
-    ensureEarlyExitBlock(MBB, false);
-
-    MachineInstr *BranchMI =
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-            .addMBB(EarlyExitBlock);
-
-    // Split the block if the branch will not come at the end.
-    auto Next = std::next(BranchMI->getIterator());
-    if (Next != MBB.end() && !Next->isTerminator())
-      splitBlock(MBB, *BranchMI, MDT);
-
-    MBB.addSuccessor(EarlyExitBlock);
-    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
-  }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
-    unsigned Opcode = 0;
-
-    // The opcodes are inverted because the inline immediate has to be
-    // the first operand, e.g. from "x < imm" to "imm > x"
-    switch (MI.getOperand(2).getImm()) {
-    case ISD::SETOEQ:
-    case ISD::SETEQ:
-      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
-      break;
-    case ISD::SETOGT:
-    case ISD::SETGT:
-      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
-      break;
-    case ISD::SETOGE:
-    case ISD::SETGE:
-      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
-      break;
-    case ISD::SETOLT:
-    case ISD::SETLT:
-      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
-      break;
-    case ISD::SETOLE:
-    case ISD::SETLE:
-      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
-      break;
-    case ISD::SETONE:
-    case ISD::SETNE:
-      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
-      break;
-    case ISD::SETO:
-      Opcode = AMDGPU::V_CMPX_O_F32_e64;
-      break;
-    case ISD::SETUO:
-      Opcode = AMDGPU::V_CMPX_U_F32_e64;
-      break;
-    case ISD::SETUEQ:
-      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
-      break;
-    case ISD::SETUGT:
-      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
-      break;
-    case ISD::SETUGE:
-      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
-      break;
-    case ISD::SETULT:
-      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
-      break;
-    case ISD::SETULE:
-      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
-      break;
-    case ISD::SETUNE:
-      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
-      break;
-    default:
-      llvm_unreachable("invalid ISD:SET cond code");
-    }
-
-    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
-    if (ST.hasNoSdstCMPX())
-      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
-    assert(MI.getOperand(0).isReg());
-
-    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
-                    MI.getOperand(0).getReg())) {
-      Opcode = AMDGPU::getVOPe32(Opcode);
-      BuildMI(MBB, &MI, DL, TII->get(Opcode))
-          .add(MI.getOperand(1))
-          .add(MI.getOperand(0));
-    } else {
-      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
-      if (!ST.hasNoSdstCMPX())
-        I.addReg(AMDGPU::VCC, RegState::Define);
-
-      I.addImm(0)  // src0 modifiers
-          .add(MI.getOperand(1))
-          .addImm(0)  // src1 modifiers
-          .add(MI.getOperand(0));
-
-      I.addImm(0);  // omod
-    }
-    return true;
-  }
-  case AMDGPU::SI_KILL_I1_TERMINATOR: {
-    const MachineFunction *MF = MI.getParent()->getParent();
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    const MachineOperand &Op = MI.getOperand(0);
-    int64_t KillVal = MI.getOperand(1).getImm();
-    assert(KillVal == 0 || KillVal == -1);
-
-    // Kill all threads if Op0 is an immediate and equal to the Kill value.
-    if (Op.isImm()) {
-      int64_t Imm = Op.getImm();
-      assert(Imm == 0 || Imm == -1);
-
-      if (Imm == KillVal) {
-        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
-                                                     : AMDGPU::S_MOV_B64), Exec)
-            .addImm(0);
-        return true;
-      }
-      return false;
-    }
-
-    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
-    if (ST.isWave32())
-      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
-    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
-        .addReg(Exec)
-        .add(Op);
-    return true;
-  }
-  default:
-    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
-  }
-}
-
 void SIInsertSkips::earlyTerm(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc DL = MI.getDebugLoc();
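
The removed skipIfDead could only run where the killing block dominated everything reachable from it; with the live mask kept in sync with exec, the same "if exec=0 then end the shader" check becomes legal at any point in control flow. A minimal sketch of that check, assuming an early-exit block ending in the null-export/s_endpgm sequence already exists (the helper name and the omitted dominator-tree bookkeeping are illustrative, not from this commit):

// Branch to the early-exit block whenever every lane has been killed.
// Since exec is kept equal to the live mask, "exec == 0" exactly means
// "no live lanes remain".
static void insertEarlyExitTest(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, const SIInstrInfo *TII,
                                MachineBasicBlock *EarlyExitBlock) {
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
      .addMBB(EarlyExitBlock);
  MBB.addSuccessor(EarlyExitBlock); // keep the CFG consistent with the branch
}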
@@ -415,7 +233,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;

-  SmallVector<MachineInstr *, 4> KillInstrs;
+  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
   SmallVector<MachineInstr *, 4> EarlyTermInstrs;
   bool MadeChange = false;

@@ -440,41 +260,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
       }
       break;

-    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-    case AMDGPU::SI_KILL_I1_TERMINATOR: {
-      MadeChange = true;
-      bool CanKill = kill(MI);
-
-      // Check if we can add an early "if exec=0 { end shader }".
-      //
-      // Note that we _always_ do this if it is correct, even if the kill
-      // happens fairly late in the shader, because the null export should
-      // generally still be cheaper than normal export(s).
-      //
-      // TODO: The dominatesAllReachable check is conservative: if the
-      // dominance is only missing due to _uniform_ branches, we could
-      // in fact insert the early-exit as well.
-      if (CanKill &&
-          MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-          dominatesAllReachable(MBB)) {
-        // Mark the instruction for kill-if-dead insertion. We delay this
-        // change because it modifies the CFG.
-        KillInstrs.push_back(&MI);
-      } else {
-        MI.eraseFromParent();
-      }
-      break;
-    }
-
-    case AMDGPU::SI_KILL_CLEANUP:
-      if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-          dominatesAllReachable(MBB)) {
-        KillInstrs.push_back(&MI);
-      } else {
-        MI.eraseFromParent();
-      }
-      break;
-
     case AMDGPU::SI_EARLY_TERMINATE_SCC0:
       EarlyTermInstrs.push_back(&MI);
       break;
@@ -491,12 +276,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
     earlyTerm(*Instr);
     Instr->eraseFromParent();
   }
-  for (MachineInstr *Kill : KillInstrs) {
-    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
-               Kill->getDebugLoc());
-    Kill->eraseFromParent();
-  }
-  KillInstrs.clear();
   EarlyTermInstrs.clear();
   EarlyExitBlock = nullptr;


llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 14 additions & 0 deletions
@@ -1641,6 +1641,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
     break;

+  case AMDGPU::S_AND_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B64));
+    break;
+
+  case AMDGPU::S_AND_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2272,10 +2284,12 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   case AMDGPU::S_XOR_B64_term:
   case AMDGPU::S_OR_B64_term:
   case AMDGPU::S_ANDN2_B64_term:
+  case AMDGPU::S_AND_B64_term:
   case AMDGPU::S_MOV_B32_term:
   case AMDGPU::S_XOR_B32_term:
   case AMDGPU::S_OR_B32_term:
   case AMDGPU::S_ANDN2_B32_term:
+  case AMDGPU::S_AND_B32_term:
     break;
   case AMDGPU::SI_IF:
   case AMDGPU::SI_ELSE:
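
The new S_AND_B64_term / S_AND_B32_term opcodes follow the established *_term pattern: an exec-mask update at the end of a block is modeled as a terminator so the register allocator places spill code before it, and expandPostRAPseudo (above) later rewrites it to the plain opcode. A hypothetical emission site, sketched under the assumption of a wave64 target and a LiveMaskReg maintained by the WQM pass:

// Emit "exec &= LiveMask" as a block terminator. Until it is expanded after
// register allocation, analyzeBranch treats it like the other *_term opcodes.
static MachineInstr *emitExecAndTerm(MachineBasicBlock &MBB,
                                     const DebugLoc &DL,
                                     const SIInstrInfo *TII,
                                     Register LiveMaskReg) {
  return BuildMI(MBB, MBB.getFirstTerminator(), DL,
                 TII->get(AMDGPU::S_AND_B64_term), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(LiveMaskReg);
}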

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 5 additions & 5 deletions
@@ -232,13 +232,15 @@ def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
 def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
 }

 let WaveSizePredicate = isWave32 in {
 def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
 def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
 }

@@ -339,24 +341,22 @@ multiclass PseudoInstKill <dag ins> {
   // required in degenerate cases (when V_CMPX cannot be used due to constant
   // bus limitations) and because it allows us to avoid having to track SCC
   // liveness across basic blocks.
-  let Defs = [EXEC,VCC,SCC] in
+  let Defs = [EXEC,SCC] in
   def _PSEUDO : PseudoInstSI <(outs), ins> {
     let isConvergent = 1;
     let usesCustomInserter = 1;
   }

-  let Defs = [EXEC,VCC,SCC] in
+  let Defs = [EXEC,SCC] in
   def _TERMINATOR : SPseudoInstSI <(outs), ins> {
     let isTerminator = 1;
   }
 }

 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
+let Defs = [VCC] in
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
