Commit bf50bed

[AMDGPU] Change control flow intrinsic lowering so that the wave reconverges at the end of the predecessor block.
1 parent 82c5d35 commit bf50bed

File tree

196 files changed: +20209, -13418 lines

Some content is hidden: large commits have some content hidden by default, so only a subset of the 196 changed files is shown below.

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 15 additions & 31 deletions
@@ -306,42 +306,26 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 
 /// Close the last opened control flow
 bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
-  llvm::Loop *L = LI->getLoopFor(BB);
-
   assert(Stack.back().first == BB);
 
-  if (L && L->getHeader() == BB) {
-    // We can't insert an EndCF call into a loop header, because it will
-    // get executed on every iteration of the loop, when it should be
-    // executed only once before the loop.
-    SmallVector <BasicBlock *, 8> Latches;
-    L->getLoopLatches(Latches);
-
-    SmallVector<BasicBlock *, 2> Preds;
-    for (BasicBlock *Pred : predecessors(BB)) {
-      if (!is_contained(Latches, Pred))
-        Preds.push_back(Pred);
-    }
-
-    BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
-                                false);
-  }
-
   Value *Exec = popSaved();
-  BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
-  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
-    Instruction *ExecDef = cast<Instruction>(Exec);
-    BasicBlock *DefBB = ExecDef->getParent();
-    if (!DT->dominates(DefBB, BB)) {
-      // Split edge to make Def dominate Use
-      FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+  Instruction *ExecDef = dyn_cast<Instruction>(Exec);
+  BasicBlock *DefBB = ExecDef->getParent();
+  for (auto Pred : predecessors(BB)) {
+    llvm::Loop *L = LI->getLoopFor(Pred);
+    bool IsLoopLatch = false;
+    if (L) {
+      SmallVector<BasicBlock *, 4> LL;
+      L->getLoopLatches(LL);
+      IsLoopLatch = std::find_if(LL.begin(), LL.end(), [Pred](BasicBlock *B) {
+                      return B == Pred;
+                    }) != LL.end();
+    }
+    if (Pred != DefBB && DT->dominates(DefBB, Pred) && !IsLoopLatch) {
+      BasicBlock::iterator InsPt(Pred->getTerminator());
+      IRBuilder<>(Pred, InsPt).CreateCall(EndCf, {Exec});
     }
-    IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
-    // TODO: StructurizeCFG 'Flow' blocks have debug locations from the
-    // condition, for now just avoid copying these DebugLocs so that stepping
-    // out of the then/else block in a debugger doesn't step to the condition.
-    IRB.SetCurrentDebugLocation(DebugLoc());
-    IRB.CreateCall(EndCf, {Exec});
   }
 
   return true;
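For readers skimming the diff: the predecessor test added above can be read in isolation as the small helper below. This is an illustrative sketch, not code from the commit; the function name and signature are invented, and llvm::is_contained stands in for the std::find_if call in the patch.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

// Sketch only: mirrors the test added to closeControlFlow() above. EndCf is
// emitted at the end of Pred (instead of at the top of the merge block BB)
// when Pred is not the block defining the saved exec mask, that definition
// dominates Pred, and Pred is not a loop latch.
static bool shouldEmitEndCFInPred(BasicBlock *Pred, BasicBlock *DefBB,
                                  const DominatorTree &DT,
                                  const LoopInfo &LI) {
  bool IsLoopLatch = false;
  if (Loop *L = LI.getLoopFor(Pred)) {
    SmallVector<BasicBlock *, 4> Latches;
    L->getLoopLatches(Latches);
    IsLoopLatch = is_contained(Latches, Pred);
  }
  return Pred != DefBB && DT.dominates(DefBB, Pred) && !IsLoopLatch;
}

The latch exclusion preserves the intent of the removed loop-header special case: placing the end-cf call on a latch edge would re-execute it on every iteration instead of once after the loop.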

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 85 additions & 0 deletions
@@ -15740,6 +15740,91 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
     }
   }
 
+  // ISel inserts copies to registers for the successor PHIs
+  // at the BB end. We need to move the SI_END_CF right before the branch.
+  // Even if we don't have to move SI_END_CF we need to take care of the
+  // S_CBRANCH_SCC0/1 as SI_END_CF overwrites SCC.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::SI_END_CF) {
+        MachineBasicBlock::iterator I(MI);
+        MachineBasicBlock::iterator Next = std::next(I);
+        bool NeedToMove = false;
+        while (Next != MBB.end() && !Next->isBranch()) {
+          NeedToMove = true;
+          Next++;
+        }
+
+        // Let's take care of SCC users as SI_END_CF defines SCC
+        bool NeedPreserveSCC =
+            Next != MBB.end() && Next->readsRegister(AMDGPU::SCC);
+        MachineBasicBlock::iterator SCCDefUse(Next);
+        // This loop will never be taken as we always have S_CBRANCH_SCC1/0 at
+        // the end of the block.
+        while (!NeedPreserveSCC && SCCDefUse != MBB.end()) {
+          if (SCCDefUse->definesRegister(AMDGPU::SCC))
+            // This should never happen - SCC def after the branch reading SCC
+            break;
+          if (SCCDefUse->readsRegister(AMDGPU::SCC)) {
+            NeedPreserveSCC = true;
+            break;
+          }
+          SCCDefUse++;
+        }
+        if (NeedPreserveSCC) {
+          MachineBasicBlock::reverse_iterator BackSeeker(Next);
+          while (BackSeeker != MBB.rend()) {
+            if (BackSeeker != MI && BackSeeker->definesRegister(AMDGPU::SCC))
+              break;
+            BackSeeker++;
+          }
+          // We need this to make some artificial MIR tests happy.
+          bool NeedSetSCCUndef = false;
+          if (BackSeeker == MBB.rend()) {
+            // We have reached the beginning of the block but haven't seen the
+            // SCC def. Given that the MIR is correct, we either have SCC live
+            // in or the SCC user's SCC operand is undef. In fact, we don't
+            // need to emit the instructions that preserve the SCC if the use
+            // is undef. We do this just because the MIR looks weird otherwise.
+            MachineOperand *SCCUseOp =
+                SCCDefUse->findRegisterUseOperand(AMDGPU::SCC, false, TRI);
+            assert(SCCUseOp);
+            bool IsSCCLiveIn = MBB.isLiveIn(AMDGPU::SCC);
+            bool IsUseUndef = SCCUseOp->isUndef();
+            NeedSetSCCUndef = (!IsSCCLiveIn && IsUseUndef);
+          }
+          MachineBasicBlock::iterator InsPt(BackSeeker);
+          Register SavedSCC =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          MachineInstr *SaveSCC =
+              BuildMI(MBB, InsPt, InsPt->getDebugLoc(),
+                      TII->get(AMDGPU::S_CSELECT_B32), SavedSCC)
+                  .addImm(1)
+                  .addImm(0);
+          if (NeedSetSCCUndef) {
+
+            MachineOperand *SCCOp =
+                SaveSCC->findRegisterUseOperand(AMDGPU::SCC, false, TRI);
+            if (SCCOp)
+              SCCOp->setIsUndef();
+          }
+          Register Tmp =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Next = BuildMI(MBB, Next, Next->getDebugLoc(),
+                         TII->get(AMDGPU::S_AND_B32_term), Tmp)
+                     .addReg(SavedSCC)
+                     .addImm(1);
+        }
+
+        if (NeedToMove) {
+          MBB.splice(Next, &MBB, &MI);
+        }
+
+        break;
+      }
+    }
+  }
+
   // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
   // classes if required. Ideally the register class constraints would differ
   // per-subtarget, but there's no easy way to achieve that right now. This is
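The SCC handling above is the subtle part: as the comment notes, SI_END_CF overwrites SCC, so an S_CBRANCH_SCC0/1 that still needs SCC would otherwise read a clobbered value. Condensed, the fix is the save/restore pair sketched below. This is not additional code from the commit: the helper name and parameters are invented, and it assumes the surrounding context of SIISelLowering.cpp (the AMDGPU opcode and register-class definitions, MachineRegisterInfo, SIInstrInfo).

// Sketch only: the save/restore shape built by the loop above.
// SavePt is a point where SCC still holds the value the branch needs, before
// SI_END_CF clobbers it; RestorePt is the SCC reader (the branch) after it.
static void saveAndRestoreSCC(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator SavePt,
                              MachineBasicBlock::iterator RestorePt,
                              MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII) {
  // SavedSCC = SCC ? 1 : 0 (S_CSELECT_B32 reads SCC implicitly).
  Register SavedSCC = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(MBB, SavePt, SavePt->getDebugLoc(),
          TII->get(AMDGPU::S_CSELECT_B32), SavedSCC)
      .addImm(1)
      .addImm(0);

  // SCC = (SavedSCC & 1) != 0, recomputed as a terminator immediately before
  // the S_CBRANCH_SCC0/1 that reads it.
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(MBB, RestorePt, RestorePt->getDebugLoc(),
          TII->get(AMDGPU::S_AND_B32_term), Tmp)
      .addReg(SavedSCC)
      .addImm(1);
}

The splice at the end of the loop then moves SI_END_CF itself down next to the block's terminators, so the exec-mask restore happens right before the branch rather than above the PHI copies that ISel placed at the block end.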

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -3102,6 +3102,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     break;
   case AMDGPU::SI_IF:
   case AMDGPU::SI_ELSE:
+  case AMDGPU::SI_END_CF:
   case AMDGPU::SI_KILL_I1_TERMINATOR:
   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
     // FIXME: It's messy that these need to be considered here at all.
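This one-line addition goes hand in hand with the SIInstructions.td change below: once SI_END_CF is marked as a terminator, SIInstrInfo::analyzeBranch meets it while walking a block's terminators and, grouped here with SI_IF and SI_ELSE, treats such a block ending as one it cannot analyze. Below is a minimal caller-side sketch of that contract; the names are invented and nothing in it comes from the commit.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Sketch only: analyzeBranch returning true means "could not analyze", so
// generic passes must leave the block's terminators (which may now include
// SI_END_CF) untouched; returning false means TBB/FBB/Cond describe the
// ending and it may be rewritten via removeBranch/insertBranch.
static bool canRewriteBlockEnding(MachineBasicBlock &MBB,
                                  const TargetInstrInfo &TII) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  return !TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false);
}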

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 2 deletions
@@ -475,8 +475,6 @@ def SI_LOOP : CFPseudoInstSI <
   let IsNeverUniform = 1;
 }
 
-} // End isTerminator = 1
-
 def SI_END_CF : CFPseudoInstSI <
   (outs), (ins SReg_1:$saved), [], 1, 1> {
   let Size = 4;
@@ -488,6 +486,8 @@ def SI_END_CF : CFPseudoInstSI <
   let mayStore = 1;
 }
 
+} // End isTerminator = 1
+
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
   let Size = 4;
